In [33]:
import json

# Load the DocRED, Wiki20m and Wiki80 JSON files.
# The encoding is stated explicitly: without it open() decodes with the
# platform's locale default, which is not UTF-8 everywhere and can fail on
# (or silently corrupt) non-ASCII text in the files.
with open("./docred_rand_200.json", 'r', encoding='utf-8') as file:
    docred = json.load(file)

with open("./wiki20m_rand_500.json", 'r', encoding='utf-8') as file:
    wiki20m = json.load(file)

with open("./wiki80_rand_800.json", 'r', encoding='utf-8') as file:
    wiki80 = json.load(file)
    
# Gather every distinct relation label used across the three datasets.
# Each dataset is iterated as a mapping from a text to its list of triples;
# the relation label is the element at index 2 of a triple.
original_rel_set = set()

for corpus in (docred, wiki20m, wiki80):
    for corpus_triples in corpus.values():
        original_rel_set.update(entry[2] for entry in corpus_triples)

In [35]:
# Shorter replacements for long relation labels.
# Labels that do not appear as a key here are left unchanged by the cells
# that consult this table.
wiki_relation_mapping = {
    'contains administrative territorial entity': 'contains',
    'languages spoken, written or signed': 'language',
    'language of work or name': 'language',
    'located in the administrative territorial entity': 'located in',
    'original language of film or TV show': 'original language',
    'position played on team / speciality': 'position played',
    'sports season of league or competition': 'sports season',
}

### CDR

In [19]:
import json

# Load the CDR triples. The encoding is given explicitly so decoding does not
# depend on the platform's locale default.
with open('./cdr_rand_200.json', 'r', encoding='utf-8') as f:
    cdr = json.load(f)

import gzip

# Load the table used later to look up a term by id (see mesh_id2term usage).
# The archive is read in binary mode on purpose: json.load accepts bytes and
# detects the UTF-8/16/32 encoding itself.
with gzip.open('./mesh.json.gz', 'rb') as file:
    mesh_id2term = json.load(file)

In [20]:
from collections import defaultdict

# Build text -> list of [term, 'induced by', term] triples from the CDR data.
# Only triples whose label (index 3) is '1:CID:2' are kept, and only when both
# ids (indices 0 and 1) are present in mesh_id2term. The output puts the term
# of the second id first and the term of the first id last.
# Texts that yield no triple never get a key in the result.
cdr_processed = defaultdict(list)

for passage, annotations in cdr.items():
    for annotation in annotations:
        if annotation[3] != '1:CID:2':
            continue
        first_id, second_id = annotation[0], annotation[1]
        if first_id in mesh_id2term and second_id in mesh_id2term:
            cdr_processed[passage].append(
                [mesh_id2term[second_id], 'induced by', mesh_id2term[first_id]]
            )

In [21]:
# Write the processed CDR triples to disk as indented JSON.
# The output directory is created first: open(..., 'w') raises
# FileNotFoundError when './processed' does not exist yet.
import os

os.makedirs('./processed', exist_ok=True)
with open('./processed/cdr_processed.json', 'w', encoding='utf-8') as file:
    json.dump(cdr_processed, file, indent=4)

### DocRED

In [22]:
import json

# Load the DocRED JSON file. The encoding is explicit so decoding does not
# depend on the platform's locale default.
with open("./docred_rand_200.json", 'r', encoding='utf-8') as file:
    docred = json.load(file)

In [23]:
from collections import defaultdict

# Reorder every DocRED triple from (first, second, relation) to
# [first, relation, second], replacing the relation label with its shorter
# form when wiki_relation_mapping defines one.
docred_process = defaultdict(list)

for text, triples in docred.items():
    for triple in triples:
        # Fall back to the original label when no replacement is defined.
        relation = wiki_relation_mapping.get(triple[2], triple[2])
        docred_process[text].append([triple[0], relation, triple[1]])

# Create the output directory first: open(..., 'w') raises FileNotFoundError
# when './processed' does not exist yet.
import os

os.makedirs('./processed', exist_ok=True)
with open('./processed/docred_processed.json', 'w', encoding='utf-8') as file:
    json.dump(docred_process, file, indent=4)

### NYT10m

In [24]:
import json

# Load the NYT10m JSON file. The encoding is explicit so decoding does not
# depend on the platform's locale default.
with open("./nyt10m_rand_500.json", 'r', encoding='utf-8') as file:
    nyt10m = json.load(file)

In [25]:
from collections import defaultdict

# Reorder every NYT10m triple from (first, second, relation) to
# [first, relation, second] and simplify the relation label.
nyt10m_processed = defaultdict(list)

for text, triples in nyt10m.items():
    for triple in triples:
        # Keep only the last '/'-separated piece of the label and turn
        # underscores into spaces, e.g. 'a/b/some_name' -> 'some name'.
        relation = triple[2].split('/')[-1].replace('_', ' ')
        nyt10m_processed[text].append([triple[0], relation, triple[1]])

# Create the output directory first: open(..., 'w') raises FileNotFoundError
# when './processed' does not exist yet.
import os

os.makedirs('./processed', exist_ok=True)
with open('./processed/nyt10m_processed.json', 'w', encoding='utf-8') as file:
    json.dump(nyt10m_processed, file, indent=4)

### Wiki20m

In [26]:
import json

# Load the Wiki20m JSON file. The encoding is explicit so decoding does not
# depend on the platform's locale default.
with open("./wiki20m_rand_500.json", 'r', encoding='utf-8') as file:
    wiki20m = json.load(file)

In [27]:
from collections import defaultdict

# Reorder every Wiki20m triple from (term_1, term_2, relation) to
# [term_1, relation, term_2], replacing the relation label with its shorter
# form when wiki_relation_mapping defines one.
# NOTE: a triple is kept even when its inverse [term_2, relation, term_1] is
# already stored for the same text; a filter for that case previously sat
# here as commented-out code and was never active.
wiki20m_processed = defaultdict(list)

for text, triples in wiki20m.items():
    # Each triple must have exactly three elements for this unpacking.
    for term_1, term_2, relation in triples:
        # Fall back to the original label when no replacement is defined.
        relation = wiki_relation_mapping.get(relation, relation)
        wiki20m_processed[text].append([term_1, relation, term_2])

# Create the output directory first: open(..., 'w') raises FileNotFoundError
# when './processed' does not exist yet.
import os

os.makedirs('./processed', exist_ok=True)
with open('./processed/wiki20m_processed.json', 'w', encoding='utf-8') as file:
    json.dump(wiki20m_processed, file, indent=4)
        
        


### TACRED

In [28]:
import json

# Load the TACRED JSON file. The encoding is explicit so decoding does not
# depend on the platform's locale default.
with open("./tacred_rand_800.json", 'r', encoding='utf-8') as file:
    tacred = json.load(file)

In [29]:
from collections import defaultdict

# Build text -> list of [first, relation, second] triples from the TACRED data,
# dropping every triple whose label is exactly 'NA'.
# The label is shortened by taking its second ':'-separated field and then the
# last '/'-separated piece of that field (so '<p>:<a>/<b>' becomes '<b>').
# Texts whose triples are all 'NA' never get a key in the result.
tacred_processed = defaultdict(list)

for sentence, annotations in tacred.items():
    for annotation in annotations:
        label = annotation[2]
        if label != 'NA':
            short_label = label.split(':')[1].split('/')[-1]
            tacred_processed[sentence].append(
                [annotation[0], short_label, annotation[1]]
            )

In [30]:
# Write the processed TACRED triples to disk as indented JSON.
# The output directory is created first: open(..., 'w') raises
# FileNotFoundError when './processed' does not exist yet.
import os

os.makedirs('./processed', exist_ok=True)
with open('./processed/tacred_processed.json', 'w', encoding='utf-8') as file:
    json.dump(tacred_processed, file, indent=4)

### Wiki80

In [31]:
import json

# Load the Wiki80 JSON file. The encoding is explicit so decoding does not
# depend on the platform's locale default.
with open("./wiki80_rand_800.json", 'r', encoding='utf-8') as file:
    wiki80 = json.load(file)

In [32]:
from collections import defaultdict

# Reorder every Wiki80 triple from (first, second, relation) to
# [first, relation, second], replacing the relation label with its shorter
# form when wiki_relation_mapping defines one.
wiki80_processed = defaultdict(list)

for text, triples in wiki80.items():
    for triple in triples:
        # Fall back to the original label when no replacement is defined.
        relation = wiki_relation_mapping.get(triple[2], triple[2])
        wiki80_processed[text].append([triple[0], relation, triple[1]])

# Create the output directory first: open(..., 'w') raises FileNotFoundError
# when './processed' does not exist yet.
import os

os.makedirs('./processed', exist_ok=True)
with open('./processed/wiki80_processed.json', 'w', encoding='utf-8') as file:
    json.dump(wiki80_processed, file, indent=4)