In [1]:
import re
import csv
import statistics
import pandas as pd

from spacy.lang.en import English

## Creating `train` and `test` sets of TCR for R-BERT

In [2]:
# *tcr.xlsx* is the file we created using CREST. To see how you can create this file, check the CREST repository:
# https://github.com/phosseini/crest
df = pd.read_excel('../data/tcr.xlsx')

In [3]:
def string_to_idx(string):
    """
    converting string of span indices to a dictionary in form of {"span1": [], "span2": [], "signal": []}
    :param string: string of span indices in form of:
        span1 start_1:end_1 ... start_n:end_n
        span2 start_1:end_1 ... start_n:end_n
        signal start_1:end_1 ... start_n:end_n
    :return:
    """
    idx_val = {"span1": [], "span2": [], "signal": []}
    string = string.strip().split('\n')
    for index, (key, value) in enumerate(idx_val.items()):
        spans = string[index].split(' ')
        for span in spans[1:]:
            span = span.split(':')
            idx_val[key].append([int(span[0]), int(span[1])])
    return idx_val

def add_labels(x):
    x['idx'] = string_to_idx(x['idx'])
    x1 = x['idx']['span1'][0]
    x2 = x['idx']['span2'][0]
    text = x['context']
    tagged_text = text[:x1[0]] + " <e1> " + text[x1[0]:x1[1]] + " </e1> "
    tagged_text += text[x1[1]:x2[0]] + " <e2> " + text[x2[0]:x2[1]] + " </e2> "
    tagged_text += text[x2[1]:]
    return tagged_text

df['context_tagged'] = df.apply(lambda row: add_labels(row), axis=1)

In [4]:
nlp = English()
nlp.add_pipe("sentencizer")

def find_sent(sents):
    for i in range(len(sents)):
        if '<e1>' in sents[i].text and '<e2>' in sents[i].text:
            return sents[i].text
        else:
            txt = sents[i-1].text + " " + sents[i].text + " " + sents[i+1].text
            if '<e1>' in txt and '<e2>' in txt:
                return txt
    return

def create_split(data, file_name):
    with open(file_name, 'w') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        for idx, row in data.iterrows():
            label = 'Cause-Effect(e1,e2)' if row['direction'] == 0 else 'Cause-Effect(e2,e1)'
            sent = find_sent(list(nlp(row['context_tagged']).sents))
            sent = sent.replace('\n', ' ')
            sent = sent.replace('"', ' ')
            sent = re.sub(' +', ' ', sent)
            writer.writerow([label, sent.strip()])

train = df.loc[df['split'] == 1]
test = df.loc[df['split'] == 2]
create_split(train, '../data/train.tsv')
create_split(test, '../data/test.tsv')

### Checking tags and sequence length

In [5]:
splits = ['train', 'test']
for split in splits:
    data = pd.read_csv('../data/{}.tsv'.format(split), sep='\t', names=["label", "text"])
    sequence_lengths = list()
    for idx, row in data.iterrows():
        sequence_lengths.append(len(row['text'].split(' ')))
        if not any(tag in row['text'] for tag in ['<e1>', '<e2>', '</e1>', '</e2>']):
            print('missing tag!')
    print('- avg. sequence (seq.) length in [{}]: {}'.format(split, statistics.mean(sequence_lengths)))
    print("- min seq. length: {}\n- max seq. length: {}\n".format(min(sequence_lengths), max(sequence_lengths)))

- avg. sequence (seq.) length in [train]: 70.28125
- min seq. length: 25
- max seq. length: 115

- avg. sequence (seq.) length in [test]: 65.36363636363636
- min seq. length: 20
- max seq. length: 100

