In [20]:
from glob import glob
import json
import os
import re
from tqdm import tqdm

def rebel_format(triplets):
    """
    Linearise a list of ``[head, relation, tail]`` triples into a REBEL string.

    Example input:
    [['Bruno Santana', 'participant of', '2004 Summer Olympics'],
    ['Bruno Santana', 'participant of', '2008 Summer Olympics'],
    ['Bruno Santana', 'country of citizenship', 'Brazil']]

    Example output:
    <triplet> Bruno Santana <subj> 2004 Summer Olympics <obj> participant of <subj> 2008 Summer Olympics <obj> participant of <subj> Brazil <obj> country of citizenship

    Note the marker convention: ``<subj>`` introduces the tail entity
    (``triple[2]``) and ``<obj>`` introduces the relation (``triple[1]``).
    A ``<triplet>`` header is written only when the head differs from the head
    of the immediately preceding triple, so consecutive triples with the same
    head share a single header.

    Returns the space-joined string; an empty input yields ``''``.
    """
    pieces = []
    # Sentinel that compares unequal to every real head, so the first triple
    # always opens a new `<triplet>` group.
    last_head = object()
    for triple in triplets:
        head, relation, tail = triple[0], triple[1], triple[2]
        if head != last_head:
            pieces.append('<triplet>')
            pieces.extend(head.split())
        pieces.append('<subj>')
        pieces.extend(tail.split())
        pieces.append('<obj>')
        pieces.extend(relation.split())
        last_head = head

    joined = ' '.join(pieces)
    return re.sub(r'[ ]+', ' ', joined).strip()

def parse_rebel(text):
    """
    Parse a REBEL-linearised string back into a list of triplet dicts.

    The ``<s>``, ``<pad>`` and ``</s>`` tokens are removed first; the rest is
    split on whitespace and consumed by a small state machine:

    * ``<triplet>`` starts collecting the head entity,
    * ``<subj>`` starts collecting the tail entity,
    * ``<obj>`` starts collecting the relation.

    Tokens seen before the first marker are ignored.

    Returns a list of ``{'head': ..., 'type': ..., 'tail': ...}`` dicts with
    surrounding whitespace stripped from every value.
    """
    found = []
    head, rel, tail = '', '', ''
    mode = 'x'  # 'x' = no marker seen yet; 't' / 's' / 'o' = field being filled

    cleaned = text.strip()
    for special in ('<s>', '<pad>', '</s>'):
        cleaned = cleaned.replace(special, '')

    for tok in cleaned.split():
        if tok == '<triplet>':
            mode = 't'
            # A new head closes the triple that was being built, if any.
            if rel:
                found.append({'head': head.strip(), 'type': rel.strip(), 'tail': tail.strip()})
                rel = ''
            head = ''
        elif tok == '<subj>':
            mode = 's'
            # Same head, new tail: emit the finished triple. `rel` is left
            # as-is here and only cleared by the next `<obj>` marker.
            if rel:
                found.append({'head': head.strip(), 'type': rel.strip(), 'tail': tail.strip()})
            tail = ''
        elif tok == '<obj>':
            mode = 'o'
            rel = ''
        elif mode == 't':
            head += ' ' + tok
        elif mode == 's':
            tail += ' ' + tok
        elif mode == 'o':
            rel += ' ' + tok

    # Flush the trailing triple only when all three parts were collected.
    if head and rel and tail:
        found.append({'head': head.strip(), 'type': rel.strip(), 'tail': tail.strip()})
    return found

In [2]:
# Names of the source JSONL datasets; the export cells below open these same
# file names. NOTE(review): `files` itself is not referenced by any cell
# visible here — confirm whether it is still needed.
files = [
    'kg-astroawani.jsonl',
    'kg-paragraph-wikipedia.jsonl',
]

In [4]:
# Build the lookup from a REBEL-formatted KG string (`src`) to the value stored
# under `r` for it (used by later cells as the translated KG string), reading
# every shard that matches the glob pattern.
mapping = {}
# glob() returns paths in arbitrary order; sorting makes the result
# reproducible when the same `src` appears in more than one shard (the
# lexicographically last shard wins on every run).
for shard_path in sorted(glob('chatgpt-kg-triplets.jsonl*.splitted.requested')):
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(shard_path, encoding='utf-8') as fopen:
        for raw_line in fopen:
            if not raw_line.strip():
                continue  # tolerate blank lines instead of crashing in json.loads
            row = json.loads(raw_line)
            mapping[row['src']] = row['r']

len(mapping)

213313

In [21]:
def parse_kg_ms(kg_ms):
    """
    Clean a translated REBEL string and parse it into triplet dicts.

    Parameters
    ----------
    kg_ms : str or any
        The translated, REBEL-linearised string. Any non-string value (for
        example ``None`` when the lookup found no translation) is returned
        unchanged.

    Returns
    -------
    The list produced by ``parse_rebel`` for string input, ``None`` if parsing
    raised, or the input itself when it is not a string.

    Cleaning steps applied to strings before parsing:

    1. ``'Tempat terbang'`` is replaced by ``'triplet'``.
       NOTE(review): presumably this undoes a mistranslation of the
       ``<triplet>`` marker — confirm against the translated data.
    2. At most one leading and one trailing ``"`` is removed, then at most one
       leading and one trailing ``'``.
    """
    if not isinstance(kg_ms, str):
        return kg_ms

    kg_ms = kg_ms.replace('Tempat terbang', 'triplet')
    # startswith/endswith are safe on the empty string; indexing with [-1]
    # raised IndexError for '' and for a value that was only a quote character.
    for quote in ('"', "'"):
        if kg_ms.startswith(quote):
            kg_ms = kg_ms[1:]
        if kg_ms.endswith(quote):
            kg_ms = kg_ms[:-1]

    try:
        return parse_rebel(kg_ms)
    except Exception as e:
        # Deliberate best-effort: report the problem and mark the entry as
        # untranslated rather than aborting the whole export.
        print(e)
        return None

In [22]:
def _translate_kg(kg_entry):
    """
    Return the parsed translation for one KG entry.

    Rebuilds the REBEL string from ``kg_entry['triplets']`` (each item read via
    its ``subject`` / ``predicate`` / ``object`` keys), looks that string up in
    ``mapping`` and passes the result through ``parse_kg_ms``. A string missing
    from ``mapping`` becomes ``parse_kg_ms(None)``.
    """
    triples = [[t['subject'], t['predicate'], t['object']] for t in kg_entry['triplets']]
    kg = rebel_format(triples).strip()
    return parse_kg_ms(mapping.get(kg))


# Open the input first so that a missing source file does not leave behind a
# freshly truncated (empty) output file.
with open(os.path.join('/home/husein/ssd3/kg', 'kg-astroawani.jsonl'), encoding='utf-8') as fopen, \
        open('kg-astroawani.translated.jsonl', 'w', encoding='utf-8') as fopen_l:
    for raw_line in tqdm(fopen):
        record = json.loads(raw_line)

        # `title_kg_ms` / `description_kg_ms` are only added when the source
        # field is truthy, so records without a KG keep their original keys.
        for field in ('title_kg', 'description_kg'):
            if record[field]:
                record[f'{field}_ms'] = _translate_kg(record[field])

        # Each body row is indexed as (row[0], row[1]); rows whose row[1] is
        # falsy are skipped, and row[0] is carried along so results can still
        # be matched to their source row.
        record['body_kg_ms'] = [
            (row[0], _translate_kg(row[1]))
            for row in record['body_kg']
            if row[1]
        ]

        fopen_l.write(f'{json.dumps(record)}\n')

9162it [00:02, 3564.52it/s]


In [26]:
# Attach translated KG triplets to every Wikipedia paragraph record.
# Open the input first so that a missing source file does not leave behind a
# freshly truncated (empty) output file.
with open(os.path.join('/home/husein/ssd3/kg', 'kg-paragraph-wikipedia.jsonl'), encoding='utf-8') as fopen, \
        open('kg-paragraph-wikipedia.translated.jsonl', 'w', encoding='utf-8') as fopen_l:
    for raw_line in tqdm(fopen):
        record = json.loads(raw_line)

        paragraph_kg_ms = []
        for row in record['paragraph_kg']:
            # Each row is indexed as (row[0], row[1]); rows with a falsy
            # row[1] contribute nothing to the output list.
            if not row[1]:
                continue
            triples = [[t['subject'], t['predicate'], t['object']] for t in row[1]['triplets']]
            # The REBEL string is the key under which the translation was stored.
            kg = rebel_format(triples).strip()
            paragraph_kg_ms.append((row[0], parse_kg_ms(mapping.get(kg))))
        record['paragraph_kg_ms'] = paragraph_kg_ms

        fopen_l.write(f'{json.dumps(record)}\n')

25032it [00:02, 8514.97it/s]


In [27]:
# Print the first three lines of the translated Wikipedia output as a quick sanity check.
!head -n 3 kg-paragraph-wikipedia.translated.jsonl

{"paragraph": ["Sir Francis Owen Garbatt Williams CBE (lahir 16 April 1942) ialah pengasas dan ketua pasukan bagi pasukan Formula Satu WilliamsF1."], "paragraph_kg": [["Sir Francis Owen Garbatt Williams CBE (lahir 16 April 1942) ialah pengasas dan ketua pasukan bagi pasukan Formula Satu WilliamsF1.", null]], "paragraph_kg_ms": []}
{"paragraph": ["Menurut bancian India pada tahun 2001, Remuna memiliki bilangan penduduk seramai 28,958. Dari jumlah tersebut, kaum lelaki membentuk 52% dari jumlah keseluruhan populasi dan wanita seramai 48%. ", "Menurut bancian India pada tahun 2001 Remuna memiliki kadar pendidikan (kebolehan membaca) 61%, melebihi kadar purata kebangsaan 59.5%; dengan 59% lelaki dan 41% wanita mampu membaca. 14% dari populasi berusia di bawah 6 tahun. (2001)"], "paragraph_kg": [["Menurut bancian India pada tahun 2001, Remuna memiliki bilangan penduduk seramai 28,958. Dari jumlah tersebut, kaum lelaki membentuk 52% dari jumlah keseluruhan populasi dan wanita seramai 48%. "