In [1]:
# One-time setup: uncomment the next two lines to download and unpack the
# rebel_dataset.zip archive from the Hugging Face Hub.
# !wget https://huggingface.co/datasets/Babelscape/rebel-dataset/resolve/main/rebel_dataset.zip
# !unzip rebel_dataset.zip
# Show which en_*.jsonl files are present in the working directory.
!ls en_*.jsonl

en_test.jsonl  en_train.jsonl  en_val.jsonl


In [7]:
import re
import json

def rebel_format(triplets):
    """
    Linearize ``[head, relation, tail]`` triples into a single REBEL string.

    Each triple is written as ``<subj> tail <obj> relation``. A
    ``<triplet> head`` prefix is written for the first triple and whenever
    the head differs from the head of the immediately preceding triple, so
    consecutive triples that share a head are grouped under one prefix.

    Example input:
    [['Bruno Santana', 'participant of', '2004 Summer Olympics'],
    ['Bruno Santana', 'participant of', '2008 Summer Olympics'],
    ['Bruno Santana', 'country of citizenship', 'Brazil']]

    Example output:
    <triplet> Bruno Santana <subj> 2004 Summer Olympics <obj> participant of <subj> 2008 Summer Olympics <obj> participant of <subj> Brazil <obj> country of citizenship

    Args:
        triplets: indexable sequence of triples; items 0, 1 and 2 of each
            triple are the head, relation and tail strings.

    Returns:
        The linearized string, with words separated by single spaces
        (an empty string when ``triplets`` is empty).
    """
    pieces = []
    for position, current in enumerate(triplets):
        relation_part = ['<obj>', *current[1].split()]
        tail_part = ['<subj>', *current[2].split()]
        # Only the directly preceding triple is consulted for head grouping.
        continues_previous_head = position > 0 and current[0] == triplets[position - 1][0]
        if not continues_previous_head:
            pieces.append('<triplet>')
            pieces.extend(current[0].split())
        pieces.extend(tail_part)
        pieces.extend(relation_part)

    linearized = ' '.join(pieces)
    return re.sub(r'[ ]+', ' ', linearized).strip()

def _append_triplet(triplets, subject, relation, object_):
    """Append a head/type/tail dict to ``triplets`` only if all three parts are non-empty."""
    head, type_, tail = subject.strip(), relation.strip(), object_.strip()
    if head and type_ and tail:
        triplets.append({'head': head, 'type': type_, 'tail': tail})


def parse_rebel(text):
    """
    Parse a REBEL-linearized string back into a list of triplet dicts.

    The expected layout is
    ``<triplet> head <subj> tail <obj> relation [<subj> tail <obj> relation ...]``,
    optionally wrapped in ``<s>``, ``</s>`` and ``<pad>`` tokens, which are
    removed before parsing. Words that appear before any marker are ignored.

    A triplet is emitted only when its head, relation and tail are all
    non-empty, and parser state is cleared at every marker. Malformed input
    (for example a ``<triplet>`` with no following ``<subj>``, or two
    ``<subj>`` markers in a row) therefore drops the incomplete piece
    instead of pairing it with a value left over from an earlier triplet.

    Args:
        text: the linearized string.

    Returns:
        A list of ``{'head': ..., 'type': ..., 'tail': ...}`` dicts in the
        order the triplets were completed.
    """
    triplets = []
    subject, relation, object_ = '', '', ''
    # Which field plain words are appended to:
    # 'x' = no marker seen yet, 't' = head, 's' = tail, 'o' = relation.
    current = 'x'
    cleaned = text.strip().replace('<s>', '').replace('<pad>', '').replace('</s>', '')
    for token in cleaned.split():
        if token == '<triplet>':
            # A new head starts: flush the pending triplet and clear all
            # fields so nothing leaks into the next head.
            _append_triplet(triplets, subject, relation, object_)
            subject, relation, object_ = '', '', ''
            current = 't'
        elif token == '<subj>':
            # A new tail for the same head: flush, keep the head, and clear
            # the relation so it cannot be emitted a second time.
            _append_triplet(triplets, subject, relation, object_)
            relation, object_ = '', ''
            current = 's'
        elif token == '<obj>':
            relation = ''
            current = 'o'
        elif current == 't':
            subject += ' ' + token
        elif current == 's':
            object_ += ' ' + token
        elif current == 'o':
            relation += ' ' + token
    # Flush whatever is pending at end of input.
    _append_triplet(triplets, subject, relation, object_)
    return triplets

In [8]:
from glob import glob

# Collect every path in the working directory that matches en_*.jsonl; the
# translation cell iterates over this list. glob() returns paths in
# arbitrary (filesystem) order.
# NOTE(review): this pattern would also match a file such as
# en_train.translated.jsonl if one exists when the cell runs - confirm.
files = glob('en_*.jsonl')
files

['en_test.jsonl', 'en_val.jsonl', 'en_train.jsonl']

In [15]:
# Build the lookup from each record's `src` string to its `r` string, read
# from every shard matching rebel.jsonl*.splitted.requested.
# Presumably `src` is the original text and `r` its translation - confirm
# against the job that produced these shards.
mapping = {}
# sorted(): glob order is arbitrary, and a `src` repeated across shards is
# overwritten by the later shard. A fixed order makes the winner reproducible.
for shard_path in sorted(glob('rebel.jsonl*.splitted.requested')):
    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with open(shard_path, encoding='utf-8') as shard:
        for line in shard:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            record = json.loads(line)
            mapping[record['src']] = record['r']
len(mapping)

5399956

In [67]:
!rm en_train.translated.jsonl

In [69]:
from tqdm import tqdm


def _strip_wrapping_quotes(value):
    """
    Remove one leading and one trailing double quote, then one leading and
    one trailing single quote, from ``value``.

    The four checks are independent (the quotes need not be paired).
    startswith/endswith are used instead of indexing so that an empty
    string, or one that becomes empty after a quote is removed, does not
    raise IndexError.
    """
    for quote in ('"', "'"):
        if value.startswith(quote):
            value = value[1:]
        if value.endswith(quote):
            value = value[:-1]
    return value


# For every input file, write `<input>.translated`: the same JSON records
# with two extra keys looked up in `mapping`:
#   triples_ms - parsed triplets from the mapped linearized string; None when
#                the lookup misses or parsing raises
#   text_ms    - the mapped value for the record text (None when missing)
for f in files:
    # Explicit UTF-8 so reading/writing does not depend on the machine's locale.
    with open(f'{f}.translated', 'w', encoding='utf-8') as fopen_l, \
            open(f, encoding='utf-8') as fopen:
        for line in tqdm(fopen):
            data = json.loads(line)
            triples = [
                [t['subject']['surfaceform'], t['predicate']['surfaceform'], t['object']['surfaceform']]
                for t in data['triples']
            ]
            # Lookup keys are the linearized triples and the stripped text.
            kg = rebel_format(triples).strip()
            text = data['text'].strip()
            kg_ms = mapping.get(kg)
            if isinstance(kg_ms, str):
                # Presumably the translator rendered the word "triplet" as
                # 'Tempat terbang'; restore it so the <triplet> marker parses
                # - confirm against the translated shards.
                kg_ms = kg_ms.replace('Tempat terbang', 'triplet')
                kg_ms = _strip_wrapping_quotes(kg_ms)
                try:
                    kg_ms = parse_rebel(kg_ms)
                except Exception as e:
                    # Best effort: report the problem and store null for this record.
                    print(e)
                    kg_ms = None
            data['triples_ms'] = kg_ms
            data['text_ms'] = mapping.get(text)
            fopen_l.write(f'{json.dumps(data)}\n')

152836it [00:09, 15816.80it/s]
152673it [00:09, 15954.21it/s]
2754388it [02:49, 16275.65it/s]


In [71]:
# Spot-check: print the last line of en_train.jsonl.translated.
!tail -n 1 en_train.jsonl.translated

{"docid": "33187252", "title": "Betadevario ramachandrani", "uri": "Q167998", "text": "Betadevario ramachandrani is a species of cyprinid fish that is found only in the upper Seetha River drainage in Karnataka, India. It is the only member of its genus, Betadevario. It was found in cascade and riffle-pools of a high-altitude stream.", "entities": [{"uri": "Q35047", "boundaries": [42, 50], "surfaceform": "cyprinid", "annotator": "Me"}, {"uri": "Q24930508", "boundaries": [88, 100], "surfaceform": "Seetha River", "annotator": "Me"}, {"uri": "Q1185", "boundaries": [113, 122], "surfaceform": "Karnataka", "annotator": "Me"}, {"uri": "Q668", "boundaries": [124, 129], "surfaceform": "India", "annotator": "Me"}, {"uri": "Q167998", "boundaries": [0, 25], "surfaceform": "Betadevario ramachandrani", "annotator": "Me"}], "triples": [{"subject": {"uri": "Q24930508", "boundaries": [88, 100], "surfaceform": "Seetha River", "annotator": "Me"}, "predicate": {"uri": "P131", "boundaries": null, "surfacefo