In [3]:
# !wget https://huggingface.co/datasets/Babelscape/rebel-dataset/resolve/main/rebel_dataset.zip
# !unzip rebel_dataset.zip
!ls en_*.jsonl

en_test.jsonl  en_train.jsonl  en_val.jsonl


In [4]:
import re

def rebel_format(triplets):
    """
    Convert
    [['Bruno Santana', 'participant of', '2004 Summer Olympics'],
    ['Bruno Santana', 'participant of', '2008 Summer Olympics'],
    ['Bruno Santana', 'country of citizenship', 'Brazil']]
    to rebel format,
    <triplet> Bruno Santana <subj> 2004 Summer Olympics <obj> participant of <subj> 2008 Summer Olympics <obj> participant of <subj> Brazil <obj> country of citizenship
    """
    q = []
    for no, triple in enumerate(triplets):
        obj = ['<obj>'] + triple[1].split()
        subj = ['<subj>'] + triple[2].split()
        if no > 0 and triple[0] == triplets[no - 1][0]:
            q.extend(subj + obj)
        else:
            triplet = ['<triplet>'] + triple[0].split()
            q.extend(triplet + subj + obj)
    
    return re.sub(r'[ ]+', ' ', ' '.join(q)).strip()

In [5]:
from glob import glob

files = glob('en_*.jsonl')

In [12]:
import json
from tqdm import tqdm

texts = []

for f in files:
    with open(f) as fopen:
        for l in tqdm(fopen):
            data = json.loads(l)
            triples = []
            for t in data['triples']:
                triples.append([t['subject']['surfaceform'], t['predicate']['surfaceform'], t['object']['surfaceform']])
            kg = rebel_format(triples).strip()
            text = data['text'].strip()
            texts.extend([text, kg])

152836it [00:04, 37775.30it/s]
152673it [00:04, 37845.76it/s]
2754388it [01:12, 37902.12it/s]


In [13]:
texts = [t for t in texts if len(t)]
texts = list(set(texts))
len(texts)

5691048

In [17]:
len(texts) // 3e5

18.0

In [14]:
texts[:10]

['Ham salad is a traditional Anglo-American salad. Ham salad resembles chicken salad, egg salad, and tuna salad (as well as starch-based salads like potato salad, macaroni salad, and pea salad): the primary ingredient, ham, is mixed with smaller amounts of chopped vegetables or relishes, and the whole is bound with liberal amounts of a mayonnaise, salad cream, or other similar style of salad dressing, such as Miracle Whip.',
 '<triplet> Juliette Nesville <subj> 30 July 1869 <obj> date of birth <subj> 26 July 1900 <obj> date of death <triplet> Robert Planquette <subj> Paris Conservatoire <obj> educated at',
 '<triplet> Museo de Arte Moderno y Contemporáneo de Santander y Cantabria <subj> art museum <obj> instance of <subj> Santander, Spain <obj> located in the administrative territorial entity',
 'Steve Mantis (born 1950) is a Canadian advocate for injured workers and people with disabilities. Best known for years of volunteer efforts to build a "fair and comprehensive" system for worke

In [15]:
with open('rebel.jsonl', 'w') as fopen:
    for t in texts:
        fopen.write(f'{json.dumps(t)}\n')

In [18]:
!split -l 300000 -d --additional-suffix=.splitted rebel.jsonl rebel.jsonl

In [20]:
!tail -n 10 rebel.jsonl00.splitted.requested

{"src": "Paul Hogarth, OBE, RA (born Arthur Paul Hoggarth) (4 October 1917\u00a0\u2013 27 December 2001) was an English artist and illustrator. He is best known for the cover drawings that he prepared in the 1980s for the Penguin edition of Graham Greene's books. He had distant connection with William Hogarth, whose father was also born Hoggarth.", "r": "Paul Hogarth, OBE, RA (lahir Arthur Paul Hoggarth) (4 Oktober 1917 - 27 Disember 2001) ialah seorang artis dan ilustrator Inggeris. Dia terkenal dengan lukisan muka depan yang disediakannya pada tahun 1980-an untuk edisi Penguin buku Graham Greene. Dia mempunyai hubungan yang jauh dengan William Hogarth, yang bapanya juga dilahirkan Hoggarth."}
{"src": "<triplet> I Only Get This Way with You <subj> country music <obj> genre <subj> Rick Trevino <obj> performer <subj> Learning as You Go <obj> part of <triplet> Rick Trevino <subj> country music <obj> genre <triplet> Hot Country Singles & Tracks <subj> Billboard <obj> publisher", "r": "<t