In [16]:
import json
import numpy as np
import os 
import jsonlines
from tqdm import tqdm
from transformers import T5Tokenizer

DATA_DIR='../data'

def read_jsonl(path_to_file):
    """Load a JSON Lines file into a list of decoded objects.

    The file is decoded as UTF-8 instead of the platform default encoding,
    and blank / whitespace-only lines (e.g. a trailing empty line) are
    skipped rather than being handed to `json.loads`, which would raise.

    Args:
        path_to_file: Path of the `.jsonl` file to read.

    Returns:
        A list holding one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path_to_file, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

def write_jsonl(path_to_file, data, mode='w'):
    """Serialise every item of `data` to `path_to_file` in JSON Lines format.

    Args:
        path_to_file: Destination path, opened through `jsonlines.open`.
        data: Iterable of objects handed to the writer's `write_all`.
        mode: File mode forwarded to `jsonlines.open` (defaults to 'w').
    """
    sink = jsonlines.open(path_to_file, mode)
    try:
        sink.write_all(data)
    finally:
        # Mirror the context-manager behaviour: always release the file handle.
        sink.close()

def get_data(data_dir, split):
    """Read the datapoints of one split.

    Args:
        data_dir: Directory containing the split files.
        split: Split name; the file read is `<data_dir>/<split>.jsonl`.

    Returns:
        Whatever `read_jsonl` returns for that file.
    """
    file_name = split + '.jsonl'
    return read_jsonl(os.path.join(data_dir, file_name))

def get_start_index_parts(text, sf_parts):
    """Locate the longest contiguous run of name parts that occurs in `text`.

    Candidates are built by joining consecutive items of `sf_parts` with a
    single space. They are tried from the longest run to the shortest and,
    within one length, from left to right; the first candidate found as a
    substring of `text` wins.

    Candidates that are empty or whitespace-only (produced by empty parts)
    are skipped: an empty string is a substring of every text at offset 0,
    which would otherwise report an unresolved name as found at index 0.

    Args:
        text: String to search in.
        sf_parts: Sequence of string fragments making up the name.

    Returns:
        The character offset in `text` of the first matching candidate, or
        `None` when no non-blank candidate occurs in `text`.
    """
    num_parts = len(sf_parts)

    for length in range(num_parts, 0, -1):
        for start in range(num_parts - length + 1):
            candidate = ' '.join(sf_parts[start:start + length])
            if not candidate.strip():
                # Blank candidates carry no information and always "match".
                continue
            position = text.find(candidate)
            if position != -1:
                return position

    return None

def _get_start_index(dp, sf):
    for entity in dp['entities']:
        if entity['surfaceform'] == sf:
            idx = entity['mention_start_index']
    
    if idx is None:
        return float('inf')

    return idx

def _apply_ordering_heuristic_to_datapoint(dp):
    input_text = dp['text'].lower()
    entities = dp['entities']
    entity_surfaceforms_parts = [entity['surfaceform'].lower().split("_") for entity in entities]
    mention_start_index = [get_start_index_parts(input_text, sf_parts) for sf_parts in entity_surfaceforms_parts]
    for idx, entity in zip(mention_start_index, entities):
        entity['mention_start_index'] = idx

    # apply_ordering
    sub_obj_sf_pairs = [(triplet['subject']['surfaceform'], triplet['object']['surfaceform']) for triplet in dp['triplets']]
    sub_obj_start_idx_pairs = np.array([(_get_start_index(dp, sub_sf), _get_start_index(dp, obj_sf)) for sub_sf, obj_sf in sub_obj_sf_pairs]).T
    sub_ent_name = [triplet['subject']['surfaceform'] for triplet in dp['triplets']]
    ordered_indices = np.lexsort((sub_obj_start_idx_pairs[1,:], sub_ent_name, sub_obj_start_idx_pairs[0,:]))
    dp['triplets'] = [dp['triplets'][idx] for idx in ordered_indices]

def _apply_ordering_heuristic_to_data(data, verbose, num_examples_to_show):
    """Run the per-datapoint ordering heuristic over `data`, optionally reporting.

    Args:
        data: Iterable of datapoints; each one is passed to
            `_apply_ordering_heuristic_to_datapoint`.
        verbose: When truthy, `_apply_ordering_heuristics_info` is called on
            `data` after processing.
        num_examples_to_show: Forwarded to `_apply_ordering_heuristics_info`.
    """
    progress = tqdm(data, "Locating the mention start indices.")
    for datapoint in progress:
        _apply_ordering_heuristic_to_datapoint(datapoint)

    if not verbose:
        return

    # report on non-resolved entities
    _apply_ordering_heuristics_info(data, num_examples_to_show=num_examples_to_show)


def _apply_ordering_heuristics_info(data, num_examples_to_show):
    datapoints_affected = []
    num_triplets_affected = 0
    for dp in data:
        nan_surface_forms = set()

        for entity in dp['entities']:
            if entity['mention_start_index'] is None:
                nan_surface_forms.add(entity['surfaceform'])
        
        if len(nan_surface_forms) == 0:
            continue

        datapoints_affected.append(dp)

        if num_examples_to_show > 0:
            print('------------------')
            print('Input text:', dp['text'])
            print('Not resolved entities:', nan_surface_forms)

        for triplet in dp['triplets']:
            if triplet['subject']['surfaceform'] in nan_surface_forms or triplet['object']['surfaceform'] in nan_surface_forms:
                num_triplets_affected += 1
                if num_examples_to_show > 0:
                    print(triplet)
        
        num_examples_to_show -= 1

    # get all mention_start_indices
    mention_start_indices = np.array([entity['mention_start_index'] for dp in data for entity in dp['entities']])
    # get number and portion of NaNs
    num_nans = np.sum(mention_start_indices == None)
    portion_nans = num_nans / len(mention_start_indices)

    # get total number of triplets
    num_total_triplets = 0
    for dp in data:
        num_total_triplets += len(dp['triplets'])

    print(f"Number of affected: entities ({num_nans} -- {portion_nans:.2%}), triplets ({num_triplets_affected} -- {num_triplets_affected / num_total_triplets:.2%}), datapoints ({len(datapoints_affected)} -- {len(datapoints_affected) / len(data):.2%})")

    # # print the number and portion of affected triplets
    # print(f"Number of affected triplets: {num_triplets_affected} ({num_triplets_affected / num_triplets:.2%})")

    # # print the number and portion of affected datapoints
    # print(f"Number of affected datapoints: {len(datapoints_affected)} ({len(datapoints_affected) / len(data):.2%})")
        

def apply_ordering_heuristic(data_dir, split, output_split, verbose, num_examples_to_show, output_dir=None):
    """Load a split, apply the triplet-ordering heuristic and optionally save it.

    Args:
        data_dir: Directory the input split is read from.
        split: Name of the input split (`<data_dir>/<split>.jsonl`).
        output_split: Name of the split to write; `None` skips saving.
        verbose: Forwarded to `_apply_ordering_heuristic_to_data`; when truthy
            a report on unresolved entities is printed.
        num_examples_to_show: Forwarded to `_apply_ordering_heuristic_to_data`.
        output_dir: Directory the output split is written to. Defaults to
            `data_dir` when `None`.

    Returns:
        The processed datapoints.
    """
    if output_dir is None:
        output_dir = data_dir

    print(f"Applying ordering heuristic to the split `{split}` in `{data_dir}`...")
    data = get_data(data_dir, split)

    # process the data
    _apply_ordering_heuristic_to_data(data, verbose, num_examples_to_show)

    if output_split is not None:
        # Honour `output_dir`; previously it was resolved but the file was
        # always written under `data_dir`.
        output_file_path = os.path.join(output_dir, output_split + '.jsonl')
        print(f"Saving the processed data to `{output_file_path}`...")
        write_jsonl(output_file_path, data)

    return data


In [10]:
# Dry run: with `output_split = None` the processed data is not written to disk.
dataset_name = 'sdg'
split = '_train_code_davinci'
output_split = None
# output_split = 'val_code_davinci_ordered'

apply_ordering_heuristic(
    data_dir=os.path.join(DATA_DIR, dataset_name),
    split=split,
    output_split=output_split,
    verbose=True,
    num_examples_to_show=5,
)

Applying ordering heuristic to the split `_train_code_davinci` in `../data/sdg`...


Locating the mention start indices.: 100%|██████████| 179255/179255 [00:03<00:00, 54673.53it/s]


------------------
Input text: "La candidata" is a Mexican telenovela produced by Televisa and broadcast by Las Estrellas. It stars Silvia Navarro.
Not resolved entities: {'Multiple-camera_setup'}
{'subject': {'surfaceform': 'La_candidata', 'uri': 'Q27613956'}, 'object': {'surfaceform': 'Multiple-camera_setup', 'uri': 'Q738160'}, 'predicate': {'surfaceform': 'camera setup', 'uri': 'P4312'}}
------------------
Input text: A telomere is a region of repetitive nucleotide sequences at each end of a chromosome, which protects the end of the chromosome from deterioration or from fusion with neighboring chromosomes. Telomeres also regulate chromosome aging. Telomeres shorten progressively with each cell division, which limits the number of divisions and thus helps set the Hayflick limit. Telomere length is regulated by two opposing mechanisms: attrition and elongation. Telomere length is maintained by two DNA polymerases: telomerase reverse transcriptase and mitochondrial DNA polymerase.
Not 

[{'id': 0,
  'entities': [{'surfaceform': 'Seventh_Doctor',
    'uri': 'Q2560380',
    'mention_start_index': 4},
   {'surfaceform': 'Gallifrey', 'uri': 'Q367221', 'mention_start_index': 40},
   {'surfaceform': 'Sylvester_McCoy',
    'uri': 'Q455551',
    'mention_start_index': 63},
   {'surfaceform': 'Scientist', 'uri': 'Q901', 'mention_start_index': 25}],
  'relations': [{'surfaceform': 'home world', 'uri': 'P1165'},
   {'surfaceform': 'performer', 'uri': 'P175'},
   {'surfaceform': 'occupation', 'uri': 'P106'}],
  'triplets': [{'subject': {'surfaceform': 'Seventh_Doctor',
     'uri': 'Q2560380'},
    'object': {'surfaceform': 'Scientist', 'uri': 'Q901'},
    'predicate': {'surfaceform': 'occupation', 'uri': 'P106'}},
   {'subject': {'surfaceform': 'Seventh_Doctor', 'uri': 'Q2560380'},
    'object': {'surfaceform': 'Gallifrey', 'uri': 'Q367221'},
    'predicate': {'surfaceform': 'home world', 'uri': 'P1165'}},
   {'subject': {'surfaceform': 'Seventh_Doctor', 'uri': 'Q2560380'},
    '

In [18]:
# Order the REBEL validation split and save it as `val_ordered`.
dataset_name = 'rebel'
split = 'val'
output_split = 'val_ordered'

d = apply_ordering_heuristic(
    data_dir=os.path.join(DATA_DIR, dataset_name),
    split=split,
    output_split=output_split,
    verbose=True,
    num_examples_to_show=0,
)

Applying ordering heuristic to the split `val` in `../data/rebel`...


Locating the mention start indices.: 100%|██████████| 54978/54978 [00:00<00:00, 101534.88it/s]


Number of affected: entities (9262 -- 5.87%), triplets (10683 -- 10.17%), datapoints (7382 -- 13.43%)
Saving the processed data to `../data/rebel/val_ordered.jsonl`...


In [12]:
# Order the SDG validation split and save it as `val_code_davinci_ordered`.
dataset_name = 'sdg'
split = 'val_code_davinci'
output_split = 'val_code_davinci_ordered'

apply_ordering_heuristic(
    data_dir=os.path.join(DATA_DIR, dataset_name),
    split=split,
    output_split=output_split,
    verbose=True,
    num_examples_to_show=5,
)

Applying ordering heuristic to the split `val_code_davinci` in `../data/sdg`...


Locating the mention start indices.: 100%|██████████| 15000/15000 [00:00<00:00, 56014.02it/s]


------------------
Input text: Cajetan von Felder was a jurist and a member of the German National Academy of Sciences Leopoldina and the Austrian Academy of Sciences.
Not resolved entities: {'Allgemeine_Deutsche_Biographie'}
{'subject': {'surfaceform': 'Cajetan_von_Felder', 'uri': 'Q79174'}, 'object': {'surfaceform': 'Allgemeine_Deutsche_Biographie', 'uri': 'Q590208'}, 'predicate': {'surfaceform': 'described by source', 'uri': 'P1343'}}
------------------
Input text: The 2011 census was conducted in South Africa by Statistics South Africa. The census used a questionnaire as its primary determination method.
Not resolved entities: {'Document'}
{'subject': {'surfaceform': 'Questionnaire', 'uri': 'Q747810'}, 'object': {'surfaceform': 'Document', 'uri': 'Q49848'}, 'predicate': {'surfaceform': 'subclass of', 'uri': 'P279'}}
------------------
Input text: Shenzhou 5 was a Chinese space launch vehicle that was used to launch the Shenzhou 5 spacecraft. The launch was operated by the China Nat

[{'id': 21,
  'entities': [{'surfaceform': 'Swedish_Open_Cultural_Heritage',
    'uri': 'Q7654799',
    'mention_start_index': 0},
   {'surfaceform': 'XML', 'uri': 'Q2115', 'mention_start_index': 64},
   {'surfaceform': 'Semantic_Web',
    'uri': 'Q54837',
    'mention_start_index': 141},
   {'surfaceform': 'Java_(programming_language)',
    'uri': 'Q251',
    'mention_start_index': 89},
   {'surfaceform': 'Free_software', 'uri': 'Q341', 'mention_start_index': 36}],
  'relations': [{'surfaceform': 'file format', 'uri': 'P2701'},
   {'surfaceform': 'part of', 'uri': 'P361'},
   {'surfaceform': 'programming language', 'uri': 'P277'},
   {'surfaceform': 'instance of', 'uri': 'P31'}],
  'triplets': [{'subject': {'surfaceform': 'Swedish_Open_Cultural_Heritage',
     'uri': 'Q7654799'},
    'object': {'surfaceform': 'Free_software', 'uri': 'Q341'},
    'predicate': {'surfaceform': 'instance of', 'uri': 'P31'}},
   {'subject': {'surfaceform': 'Swedish_Open_Cultural_Heritage',
     'uri': 'Q76

In [15]:
# folder = "../data/sdg"
# split = "val_code_davinci_ordered"
# path_to_file = os.path.join(folder, split + '.jsonl')
# data = read_jsonl(path_to_file)

In [16]:
# dp = data[9]
# sub_obj_sf_pairs = [(triplet['subject']['surfaceform'], triplet['object']['surfaceform']) for triplet in dp['triplets']]
# sub_obj_start_idx_pairs = np.array([(_get_start_index(dp, sub_sf), _get_start_index(dp, obj_sf)) for sub_sf, obj_sf in sub_obj_sf_pairs]).T
# sub_obj_start_idx_pairs

In [17]:
# dp['triplets'], dp['text']

In [18]:
# input_text = dp['text'].lower()
# entities = dp['entities']
# entity_surfaceforms_parts = [entity['surfaceform'].lower().split("_") for entity in entities]
# mention_start_index = [get_start_index_parts(input_text, sf_parts) for sf_parts in entity_surfaceforms_parts]
# for idx, entity in zip(mention_start_index, entities):
#     entity['mention_start_index'] = idx

# # apply_ordering
# sub_obj_sf_pairs = [(triplet['subject']['surfaceform'], triplet['object']['surfaceform']) for triplet in dp['triplets']]
# sub_obj_start_idx_pairs = np.array([(_get_start_index(dp, sub_sf), _get_start_index(dp, obj_sf)) for sub_sf, obj_sf in sub_obj_sf_pairs]).T
# ordered_indices = np.lexsort((sub_obj_start_idx_pairs[0,:], sub_obj_start_idx_pairs[1,:]))
# dp['triplets'] = [dp['triplets'][idx] for idx in ordered_indices]

In [19]:
# np.lexsort((sub_obj_start_idx_pairs[1,:], sub_obj_start_idx_pairs[0,:]))