## Binary structure classification used in tree building: Step 1. Negative samples generation

Create train, dev, and test sets; save the negative samples generated for each file ``filename.rs3`` as ``filename.json.neg``.

Output:
 - ``data_ru/*.json.neg`` and ``data_ru/*.neg.features``
 - ``data_structure/*``

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%%writefile symbol_map.py

# Character substitution table: every key is mapped to the replacement character
# given as its value, so that many rare symbols collapse onto one representative
# per class.
# NOTE(review): presumably applied to raw text to keep the character vocabulary
# small -- confirm against the code that imports symbol_map.py.
SYMBOL_MAP = {
    # Dash and quotation-mark variants -> hyphen / guillemets.
    '—': '-',
    '“': '«',
    '‘': '«',
    '”': '»',
    '’': '»',
    # Emoji and pictographic symbols -> one placeholder emoji.
    '😆': '😄',
    '😊': '😄',
    '😑': '😄',
    '😔': '😄',
    '😉': '😄',
    '❗': '😄',
    '🤔': '😄',
    '😅': '😄',
    '⚓': '😄',
    # Greek letters -> 'α'.
    'ε': 'α',
    'ζ': 'α',
    'η': 'α',
    'μ': 'α',
    'δ': 'α',
    'λ': 'α',
    'ν': 'α',
    'β': 'α',
    'γ': 'α',
    # Japanese characters -> '尋'.
    'と': '尋',
    'の': '尋',
    '神': '尋',
    '隠': '尋',
    'し': '尋',
    # Letters carrying diacritics or descenders -> undecorated base letters.
    'è': 'e',
    'ĕ': 'e',
    'ç': 'c',
    'ҫ': 'c',
    'ё': 'е',
    'Ё': 'Е',
    u'ú': 'u',
    u'Î': 'I',
    u'Ç': 'C',
    u'Ҫ': 'C',
    # Currency signs -> '$'.
    '£': '$',
    '₽': '$',
    # Letters with a breve -> undecorated base letters.
    'ӑ': 'a',
    'Ă': 'A',
}


In [None]:
import glob
import sys

sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../../')

import pandas as pd
from isanlp.annotation_rst import DiscourseUnit
from _isanlp_rst.src.isanlp_rst.rst_tree_predictor import RSTTreePredictor, GoldTreePredictor
from tqdm import tqdm_notebook as tqdm
from utils.file_reading import *

In [None]:
class RandomNegativeGenerator(object):
    """Generate negative (unrelated) pairs of adjacent text spans for one document.

    Candidates are pairs of neighbouring spans -- single EDUs or gold snippets --
    that do not occur as a (snippet_x, snippet_y) pair, in either order, in the
    gold relation table. Every candidate is labelled ``relation=False``.

    Assumes EDUs and snippets are plain strings: span membership is decided with
    the substring test ``edu in snippet``.
    """

    def __call__(self, edus, corpus, annot_text, filename=None):
        """Return a DataFrame of negative samples for a single document.

        Args:
            edus: EDU texts in document order.
            corpus: gold DataFrame with ``snippet_x`` and ``snippet_y`` columns.
            annot_text: full document text used to locate the snippets.
            filename: value stored in the ``filename`` column. When omitted, the
                module-level variable ``filename`` is used, which is what the
                notebook driver loop relies on.

        Returns:
            DataFrame with columns ``filename``, ``snippet_x``, ``snippet_y``,
            ``relation``, ``loc_x``, ``loc_y``. ``loc_x`` is the first offset of
            ``snippet_x`` in ``annot_text`` and ``loc_y`` the first offset of
            ``snippet_y`` strictly after ``loc_x``; both are -1 when not found.

        Raises:
            NameError: if ``filename`` is neither passed nor defined globally.
        """
        if filename is None:
            # Backward compatibility with callers that set a global `filename`
            # instead of passing it explicitly.
            try:
                filename = globals()['filename']
            except KeyError:
                raise NameError("name 'filename' is not defined") from None

        rows = [(filename, left, right, relation)
                for left, right, relation in self.create_training_set(edus, corpus)]
        tmp = pd.DataFrame(rows, columns=['filename', 'snippet_x', 'snippet_y', 'relation'])

        # Compute the offsets column-wise rather than through DataFrame.apply:
        # applying a row function to an empty frame returns it unchanged, i.e.
        # without the loc_x / loc_y columns that callers filter on.
        loc_x = [annot_text.find(snippet) for snippet in tmp.snippet_x]
        loc_y = [annot_text.find(snippet, start + 1)
                 for snippet, start in zip(tmp.snippet_y, loc_x)]
        tmp['loc_x'] = pd.Series(loc_x, index=tmp.index, dtype='int64')
        tmp['loc_y'] = pd.Series(loc_y, index=tmp.index, dtype='int64')
        return tmp

    def __name__(self):
        return 'RandomNegativeGenerator'

    def create_training_set(self, edus, gold):
        """Collect unique ``(left, right, False)`` triples of adjacent spans absent from ``gold``.

        Three sources of candidates are combined, in this order:
          1. consecutive EDUs;
          2. each gold snippet paired with the EDU right before / right after it;
          3. two gold snippets where one ends exactly where the other begins.

        The result keeps first-seen order, so it is reproducible across runs.
        """
        training_set = []

        # (EDU indices covered by the snippet, snippet text) for both sides of
        # every gold row in which both sides could be mapped onto EDUs.
        snippet_cache = []
        for e in gold.index:
            snippet_x = gold.loc[e, 'snippet_x']
            cache_x = self.extract_snippet_ids(snippet_x, edus)

            snippet_y = gold.loc[e, 'snippet_y']
            cache_y = self.extract_snippet_ids(snippet_y, edus)

            if cache_x and cache_y:
                snippet_cache.append((cache_x, snippet_x))
                snippet_cache.append((cache_y, snippet_y))

        for i in range(len(edus) - 1):
            if not self.check_snippet_pair_in_dataset(gold, edus[i], edus[i + 1]):
                training_set.append((edus[i], edus[i + 1], False))

        for i in gold.index:
            training_set += self.extract_negative_samples_for_snippet(gold, edus, gold.loc[i, 'snippet_x'])
            training_set += self.extract_negative_samples_for_snippet(gold, edus, gold.loc[i, 'snippet_y'])

        for i in range(len(snippet_cache)):
            cache_i, snippet_i = snippet_cache[i]
            # j == i can never match: a non-empty ascending id list cannot end
            # right before its own first element.
            for j in range(i + 1, len(snippet_cache)):
                cache_j, snippet_j = snippet_cache[j]

                if cache_i[-1] + 1 == cache_j[0]:
                    if not self.check_snippet_pair_in_dataset(gold, snippet_i, snippet_j):
                        training_set.append((snippet_i, snippet_j, False))

                if cache_j[-1] + 1 == cache_i[0]:
                    if not self.check_snippet_pair_in_dataset(gold, snippet_j, snippet_i):
                        training_set.append((snippet_j, snippet_i, False))

        # Order-preserving de-duplication (a plain set would give an order that
        # depends on string hash randomisation).
        return list(dict.fromkeys(training_set))

    def extract_snippet_ids(self, snippet, edus):
        """Return the ascending indices of all EDUs whose text occurs inside ``snippet``."""
        return [edu_nm for edu_nm, edu in enumerate(edus) if (edu in snippet)]

    def check_snippet_pair_in_dataset(self, dataset, snippet_left, snippet_right):
        """Return True if the two snippets form a row of ``dataset`` in either order."""
        forward = (dataset.snippet_x == snippet_left) & (dataset.snippet_y == snippet_right)
        backward = (dataset.snippet_y == snippet_left) & (dataset.snippet_x == snippet_right)
        return bool(forward.any() or backward.any())

    def extract_negative_samples_for_snippet(self, gold, edus, snippet):
        """Pair ``snippet`` with the EDU just before and just after it, skipping gold pairs.

        Returns an empty list when no EDU text occurs inside ``snippet``.
        """
        training_set = []

        snippet_ids = self.extract_snippet_ids(snippet, edus)

        if not snippet_ids:
            return []

        if snippet_ids[0] > 0:
            previous_edu = edus[snippet_ids[0] - 1]
            if not self.check_snippet_pair_in_dataset(gold, snippet, previous_edu):
                training_set.append((previous_edu, snippet, False))

        if snippet_ids[-1] < len(edus) - 1:
            next_edu = edus[snippet_ids[-1] + 1]
            if not self.check_snippet_pair_in_dataset(gold, snippet, next_edu):
                training_set.append((snippet, next_edu, False))

        return training_set

In [None]:
import sys
import numpy as np


class GreedyNegativeGenerator:
    """ Inversed greedy parser based on gold tree predictor.

    Rebuilds the tree bottom-up by repeatedly merging the best-scoring pair of
    adjacent nodes, and records every adjacent pair whose score is exactly 0 as
    a negative ('no_relation') sample.
    """

    def __init__(self):
        # Merging stops once no adjacent pair scores above this value.
        self.forest_threshold = 0.01
        # Forwarded unchanged to predict_pair_proba on every call.
        self._same_sentence_bonus = 0

    def __call__(self, edus, corpus,
                 annot_text, annot_tokens,
                 annot_sentences,
                 annot_lemma, annot_morph, annot_postag,
                 annot_syntax_dep_tree):
        """Return the list of negative DiscourseUnit nodes found while merging.

        Args:
            edus: list of leaf discourse units in document order; each must
                expose ``id``, ``start`` and ``end``.
            corpus: passed to ``GoldTreePredictor``.
            annot_*: annotation layers forwarded to the predictor's feature
                extraction in the order given here.

        Returns:
            List of DiscourseUnit objects with ``id=None``,
            ``relation='no_relation'`` and ``nuclearity='NN'``.

        NOTE(review): relies on ``predict_pair_proba`` returning a list (scores
        are combined with list concatenation) and on ``predict_label``
        returning a string with exactly one underscore -- confirm.
        """
        annotations = (annot_text, annot_tokens,
                       annot_sentences,
                       annot_lemma, annot_morph, annot_postag,
                       annot_syntax_dep_tree)

        negative_nodes = []

        def pair_scores(pair_features):
            return self.tree_predictor.predict_pair_proba(
                pair_features, _same_sentence_bonus=self._same_sentence_bonus)

        def record_if_negative(left, right, score):
            # Only pairs scored exactly 0 count as negatives.
            if score == 0:
                negative_nodes.append(
                    DiscourseUnit(
                        id=None,
                        left=left,
                        right=right,
                        relation='no_relation',
                        nuclearity='NN',
                        proba=score,
                        text=annot_text[left.start:right.end].strip()
                    ))

        self.tree_predictor = GoldTreePredictor(corpus)
        nodes = edus
        max_id = edus[-1].id

        # initialize scores: one entry per adjacent pair (nodes[i], nodes[i + 1])
        features = self.tree_predictor.initialize_features(nodes, *annotations)
        scores = pair_scores(features)

        for i, score in enumerate(scores):
            record_if_negative(nodes[i], nodes[i + 1], score)

        while len(nodes) > 2 and any(score > self.forest_threshold for score in scores):
            # select two nodes to merge
            j = np.argmax(np.array(scores))  # position of the pair in list

            # make the new node by merging node[j] + node[j+1]
            relation = self.tree_predictor.predict_label(features.iloc[j])
            relation, nuclearity = relation.split('_')

            max_id += 1
            temp = DiscourseUnit(
                id=max_id,
                left=nodes[j],
                right=nodes[j + 1],
                relation=relation,
                nuclearity=nuclearity,
                proba=scores[j],
                text=annot_text[nodes[j].start:nodes[j + 1].end].strip()
            )

            # modify the node list
            nodes = nodes[:j] + [temp] + nodes[j + 2:]

            # modify the scores list: only pairs touching the merged node change
            if j == 0:
                # merged node is first: rescore (merged, right neighbour)
                _features = self.tree_predictor.extract_features(nodes[j], nodes[j + 1], *annotations)
                _scores = pair_scores(_features)
                scores = _scores + scores[j + 2:]
                features = pd.concat([_features, features.iloc[j + 2:]])

                record_if_negative(nodes[j], nodes[j + 1], _scores[0])

            elif j + 1 < len(nodes):
                # merged node is in the middle: rescore both neighbouring pairs
                _features = self.tree_predictor.initialize_features(
                    [nodes[j - 1], nodes[j], nodes[j + 1]], *annotations)
                _scores = pair_scores(_features)
                features = pd.concat([features.iloc[:j - 1], _features, features.iloc[j + 2:]])
                scores = scores[:j - 1] + _scores + scores[j + 2:]

                record_if_negative(nodes[j - 1], nodes[j], _scores[0])
                record_if_negative(nodes[j], nodes[j + 1], _scores[1])

            else:
                # merged node is last: rescore (left neighbour, merged)
                _features = self.tree_predictor.extract_features(nodes[j - 1], nodes[j], *annotations)
                _scores = pair_scores(_features)
                scores = scores[:j - 1] + _scores
                features = pd.concat([features.iloc[:j - 1], _features])

                # proba is the scalar score, as in every other branch
                record_if_negative(nodes[j - 1], nodes[j], _scores[0])

        return negative_nodes

    def __name__(self):
        return 'GreedyNegativeGenerator'


### Make negative samples, save them

In [None]:
# Generator used to propose negative span pairs for every gold file.
gen1 = RandomNegativeGenerator()
## gen2 = GreedyNegativeGenerator()

paths = ['./data_ru/*.gold.pkl']
for path in paths:
    for filename in tqdm(glob.glob(path)):
        # Keep the name `filename`: the generator reads it as a global.
        filename = filename.replace('.gold.pkl', '')
        df = read_gold(filename, features=True)
        edus = read_edus(filename)
        annot = read_annotation(filename)

        tmp = gen1(edus, df, annot['text'])

        # Keep only pairs where snippet_x was found and snippet_y was found after it.
        located_mask = (tmp.loc_x > -1) & (tmp.loc_x < tmp.loc_y)
        tmp = tmp[located_mask]

        # Gold rows carry a relation label, generated rows carry False;
        # reduce the column to "is this a gold relation?".
        tt = pd.concat([df, tmp])
        tt['relation'] = tt.relation.map(lambda value: not (value == False))

        # With True rows sorted first, dropping duplicates removes any generated
        # pair that coincides with a gold pair of the same file.
        tt = tt.sort_values('relation', ascending=False)
        tt = tt.drop_duplicates(['filename', 'snippet_x', 'snippet_y'])
        tmp = tt[tt.relation == False]

        output_columns = ['filename', 'snippet_x', 'snippet_y', 'relation', 'loc_x', 'loc_y']
        unique_negatives = tmp.drop_duplicates(['snippet_x', 'snippet_y']).reset_index()
        unique_negatives[output_columns].to_json(filename + '.json.neg')

In [None]:
# Sanity check: shape of the negative-sample frame left over from the last file processed above.
tmp.shape  # > 11 ?

### Extract features

In [None]:
%%time
from _isanlp_rst.src.isanlp_rst.features_processor_default import FeaturesProcessor

# Shared feature extractor, built once and reused for every file in the extraction loop.
features_processor = FeaturesProcessor(model_dir_path='../../models', verbose=0)

In [None]:
from tqdm import tqdm_notebook as tqdm

# Exclusive upper bound on the number of tokens in either snippet of a pair;
# longer pairs are dropped after feature extraction.
MAX_LEN = 10000

In [None]:
paths = ['data_ru/*.json.neg']#, 'dep_data/*.json.neg']

# Annotation layers handed to the feature processor, in positional order.
annotation_layers = ('text', 'tokens', 'sentences', 'lemma', 'morph', 'postag', 'syntax_dep_tree')

for path in paths:
    for filename in tqdm(glob.glob(path)):
        filename = filename.replace('.json.neg', '')

        # Load the stored negatives without loc_y and discard pairs with an empty side.
        df = read_negative(filename).drop(columns=['loc_y'])
        for snippet_column in ('snippet_x', 'snippet_y'):
            df = df[df[snippet_column].str.len() > 0]

        annotation = read_annotation(filename)

        result = features_processor(df, *[annotation[layer] for layer in annotation_layers])

        # Drop rows flagged as broken first, then rows whose token lists are too long.
        result = result[result.is_broken == False]
        for token_column in ('tokens_x', 'tokens_y'):
            result = result[result[token_column].map(len) < MAX_LEN]

        result.to_pickle(filename + '.neg.features')

### Make train/dev/test splits

In [None]:
from utils.train_test_split import split_rstreebank, split_essays

# Split the corpus directory into train / dev / test file lists and report their sizes.
print('Loading RSTreebank:')
train, dev, test = split_rstreebank('./data_ru')
print('Train length:', len(train), 'Dev length:', len(dev), 'Test length:', len(test), '(files)')

In [None]:
import pandas as pd
from utils.file_reading import read_gold


# Seed used later when shuffling the prepared samples.
random_state = 45


def _load_labelled_samples(files):
    """Load gold and negative feature frames for ``files`` into one DataFrame.

    For each file name (with the '.edus' suffix removed) the gold frame is
    labelled ``relation = 1`` and the negative frame ``relation = 0``; frames
    are concatenated in file order, gold before negative.
    """
    frames = []
    for file in tqdm(files):
        basename = file.replace('.edus', '')

        gold = read_gold(basename, features=True)
        gold['relation'] = 1
        frames.append(gold)

        negative = read_negative(basename, features=True)
        negative['relation'] = 0
        frames.append(negative)

    return pd.concat(frames)


train_samples = _load_labelled_samples(train)
dev_samples = _load_labelled_samples(dev)
test_samples = _load_labelled_samples(test)

In [None]:
import os
from utils.prepare_sequence import _prepare_sequence


def correct_samples(row):
    """Move stray leading punctuation out of a snippet pair.

    * If ``snippet_x`` starts with one of ``, . ! ?`` that character is dropped
      and the remainder is stripped.
    * If ``snippet_y`` starts with ``,`` or ``.`` that character is appended to
      ``snippet_x`` and removed from ``snippet_y`` (remainder stripped).

    Empty snippets are left untouched instead of raising ``IndexError``.

    Args:
        row: object with string attributes ``snippet_x`` and ``snippet_y``
            (a DataFrame row when used with ``DataFrame.apply(axis=1)``).

    Returns:
        The same ``row`` object, modified in place.
    """
    # str.startswith accepts a tuple and is simply False for an empty string.
    if row.snippet_x.startswith((',', '.', '!', '?')):
        row.snippet_x = row.snippet_x[1:].strip()
    if row.snippet_y.startswith((',', '.')):
        row.snippet_x += row.snippet_y[0]
        row.snippet_y = row.snippet_y[1:].strip()
    return row

def prepare_data(data, max_len=10000000):
    """Clean, de-duplicate and shuffle a frame of labelled snippet pairs.

    Args:
        data: DataFrame with ``tokens_x``, ``tokens_y`` and ``relation`` columns.
        max_len: exclusive upper bound on the token count of either snippet.

    Returns:
        A new DataFrame (the input is not modified) with ``snippet_x`` /
        ``snippet_y`` rebuilt from the tokens, shuffled with the module-level
        ``random_state`` and re-indexed from 0.
    """
    data = data[data.tokens_x.map(len) < max_len]
    # Explicit copy: the columns assigned below must not be written through a
    # slice of the caller's frame (avoids pandas' chained-assignment warning).
    data = data[data.tokens_y.map(len) < max_len].copy()

    # Rebuild the snippet texts from their tokens, separated by single spaces.
    data['snippet_x'] = data.tokens_x.map(' '.join)
    data['snippet_y'] = data.tokens_y.map(' '.join)

    data = data.apply(correct_samples, axis=1)

    # Drop pairs in which either snippet is empty.
    data = data[data.snippet_x.map(len) > 0]
    data = data[data.snippet_y.map(len) > 0]

    data['snippet_x'] = data.snippet_x.map(_prepare_sequence)
    data['snippet_y'] = data.snippet_y.map(_prepare_sequence)

    # Sorted ascending with keep='last', the row with the highest relation value
    # survives when the same pair occurs more than once.
    data = data.sort_values(['relation'], ascending=True).drop_duplicates(['snippet_x', 'snippet_y'], keep='last')
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return data


# Clean every split with the same preparation routine.
train_samples = prepare_data(train_samples)
dev_samples = prepare_data(dev_samples)
test_samples = prepare_data(test_samples)

# Make sure the output directory exists, then pickle each split into it.
OUT_PATH = 'data_structure'
os.makedirs(OUT_PATH, exist_ok=True)

for split_frame, split_file in ((train_samples, 'train_samples.pkl'),
                                (dev_samples, 'dev_samples.pkl'),
                                (test_samples, 'test_samples.pkl')):
    split_frame.to_pickle(os.path.join(OUT_PATH, split_file))

In [None]:
# Inspect the training row whose snippet_x sorts last.
train_samples[['snippet_x', 'snippet_y', 'relation', 'filename']].sort_values('snippet_x').tail(1).values

In [None]:
# Number of whitespace-separated tokens in each snippet_x.
train_samples['len_x'] = train_samples.snippet_x.map(lambda row: len(row.split()))

In [None]:
# Class balance: number of training rows per relation value.
train_samples.relation.value_counts()