In [33]:
import json

# Load the DocRED, Wiki20m and Wiki80 samples; each maps a text to its triples.
with open("./docred_rand_200.json") as docred_file:
    docred = json.load(docred_file)

with open("./wiki20m_rand_500.json") as wiki20m_file:
    wiki20m = json.load(wiki20m_file)

with open("./wiki80_rand_800.json") as wiki80_file:
    wiki80 = json.load(wiki80_file)

# Collect every distinct relation label across the three samples.
# The relation label sits at index 2 of each triple.
original_rel_set = {
    triple[2]
    for dataset in (docred, wiki20m, wiki80)
    for triples in dataset.values()
    for triple in triples
}

In [35]:
# Short replacements for verbose relation labels. Labels that do not appear
# here are left unchanged by the processing cells that consult this table.
wiki_relation_mapping = {
    'contains administrative territorial entity': 'contains',
    'languages spoken, written or signed': 'language',
    'language of work or name': 'language',
    'located in the administrative territorial entity': 'located in',
    'original language of film or TV show': 'original language',
    'position played on team / speciality': 'position played',
    'sports season of league or competition': 'sports season',
}

### CDR

In [6]:
import json

# Load the sampled CDR documents: a mapping from text to its list of raw triples.
with open('./cdr_rand_200.json', 'r') as f:
    cdr = json.load(f)

import gzip

# Load the identifier -> name table used to resolve the ids found in the CDR
# triples. A context manager is used so the file handle is closed as soon as
# the JSON has been parsed rather than being left open for the rest of the
# kernel session.
with open('../datasets/cdr/id2name.json', 'r') as id2name_file:
    id2name = json.load(id2name_file)

In [9]:
from collections import defaultdict

# Convert raw CDR triples into [name, phrase, name] triples.
# Raw layout: index 0 and 1 are entity ids, index 3 is the relation label.
# Triples with any other label, or with an id missing from id2name, are dropped.
cdr_processed = defaultdict(list)

for text, triple_list in cdr.items():
    for triple in triple_list:
        label = triple[3]
        if label == '1:CID:2':
            phrase = 'induced by'
        elif label == '1:NR:2':
            phrase = 'not induced by'
        else:
            continue

        first_id, second_id = triple[0], triple[1]
        if first_id not in id2name or second_id not in id2name:
            continue

        # The output order is reversed: the second entity becomes the subject.
        new_triple = [id2name[second_id], phrase, id2name[first_id]]
        cdr_processed[text].append(new_triple)

In [10]:
# Persist the converted CDR triples as indented JSON.
with open('./processed/cdr_processed.json', 'w') as out_file:
    out_file.write(json.dumps(cdr_processed, indent=4))

### DocRED

In [22]:
import json

# Load the DocRED sample (text -> list of triples).
with open("./docred_rand_200.json") as docred_file:
    docred = json.load(docred_file)

In [23]:
from collections import defaultdict

# Reorder DocRED triples from [head, tail, relation] to [head, relation, tail],
# shortening the relation label when wiki_relation_mapping has an entry for it.
docred_process = defaultdict(list)

for text, triples in docred.items():
    for triple in triples:
        label = triple[2]
        relation = wiki_relation_mapping.get(label, label)
        reordered = [triple[0], relation, triple[1]]
        docred_process[text].append(reordered)

# Persist the reordered triples as indented JSON.
with open('./processed/docred_processed.json', 'w') as out_file:
    json.dump(docred_process, out_file, indent=4)

### NYT10m

In [24]:
import json

# Load the NYT10m sample (text -> list of triples).
with open("./nyt10m_rand_500.json") as nyt10m_file:
    nyt10m = json.load(nyt10m_file)

In [25]:
from collections import defaultdict

# Reorder NYT10m triples to [head, relation, tail]. The relation label is
# reduced to its last '/'-separated segment, with underscores turned into spaces.
nyt10m_processed = defaultdict(list)

for text, triples in nyt10m.items():
    for triple in triples:
        last_segment = triple[2].rsplit('/', 1)[-1]
        relation = last_segment.replace('_', ' ')
        reordered = [triple[0], relation, triple[1]]
        nyt10m_processed[text].append(reordered)

# Persist the reordered triples as indented JSON.
with open('./processed/nyt10m_processed.json', 'w') as out_file:
    json.dump(nyt10m_processed, out_file, indent=4)

### Wiki20m

In [26]:
import json

# Load the Wiki20m sample (text -> list of triples).
with open("./wiki20m_rand_500.json") as wiki20m_file:
    wiki20m = json.load(wiki20m_file)

In [27]:
from collections import defaultdict

# Reorder Wiki20m triples from [head, tail, relation] to [head, relation, tail],
# shortening the relation label when wiki_relation_mapping has an entry for it.
# Triples that only differ by swapped head/tail are not filtered out.
wiki20m_processed = defaultdict(list)

for text, triples in wiki20m.items():
    for triple in triples:
        head, tail, label = triple
        relation = wiki_relation_mapping.get(label, label)
        reordered = [head, relation, tail]
        wiki20m_processed[text].append(reordered)


# Persist the reordered triples as indented JSON.
with open('./processed/wiki20m_processed.json', 'w') as out_file:
    json.dump(wiki20m_processed, out_file, indent=4)
        
        


### TACRED

In [28]:
import json

# Load the TACRED sample (text -> list of triples).
with open("./tacred_rand_800.json") as tacred_file:
    tacred = json.load(tacred_file)

In [29]:
from collections import defaultdict

# Reorder TACRED triples to [head, relation, tail], skipping 'NA' relations.
# The relation kept is the part after the first ':' and, within that, the
# last '/'-separated segment. A label without ':' raises IndexError.
tacred_processed = defaultdict(list)

for text, triples in tacred.items():
    for triple in triples:
        label = triple[2]
        if label == 'NA':
            continue
        after_colon = label.split(':')[1]
        relation = after_colon.rsplit('/', 1)[-1]
        reordered = [triple[0], relation, triple[1]]
        tacred_processed[text].append(reordered)

In [30]:
# Persist the reordered TACRED triples as indented JSON.
with open('./processed/tacred_processed.json', 'w') as out_file:
    out_file.write(json.dumps(tacred_processed, indent=4))

### Wiki80

In [31]:
import json

# Load the Wiki80 sample (text -> list of triples).
with open("./wiki80_rand_800.json") as wiki80_file:
    wiki80 = json.load(wiki80_file)

In [32]:
from collections import defaultdict

# Reorder Wiki80 triples from [head, tail, relation] to [head, relation, tail],
# shortening the relation label when wiki_relation_mapping has an entry for it.
wiki80_processed = defaultdict(list)

for text, triples in wiki80.items():
    for triple in triples:
        label = triple[2]
        relation = wiki_relation_mapping.get(label, label)
        reordered = [triple[0], relation, triple[1]]
        wiki80_processed[text].append(reordered)

# Persist the reordered triples as indented JSON.
with open('./processed/wiki80_processed.json', 'w') as out_file:
    json.dump(wiki80_processed, out_file, indent=4)

### Add embeddings for CDR triples

In [11]:
from openai_emb import embedding_retriever
from tqdm import tqdm

# Load the existing element -> embedding cache.
with open('/data/pj20/gre_element_embedding_dict.json', 'r') as f:
    element_embedding_dict = json.load(f)

# Load the processed CDR triples, stored as [term_1, relation, term_2].
with open('./processed/cdr_processed.json', 'r') as file:
    cdr = json.load(file)

# Fetch an embedding for every term and relation not yet in the cache.
for text, triples in tqdm(cdr.items()):
    for term_1, relation, term_2 in triples:
        for element in (term_1, term_2, relation):
            if element not in element_embedding_dict:
                element_embedding_dict[element] = embedding_retriever(element)

# Write the extended cache back to the same file.
with open('/data/pj20/gre_element_embedding_dict.json', 'w') as f:
    json.dump(element_embedding_dict, f, indent=4)

100%|██████████| 200/200 [01:06<00:00,  2.99it/s]


In [13]:
# Load the triples stored in the GPT-3.5 (semi) result file for the CDR
# sample, as [term_1, relation, term_2].
with open('../processed_results/cdr_rand_200_gpt-3.5_semi_1.json', 'r') as file:
    cdr = json.load(file)

# Fetch an embedding for every term and relation not yet in the cache.
for text, triples in tqdm(cdr.items()):
    for term_1, relation, term_2 in triples:
        for element in (term_1, term_2, relation):
            if element not in element_embedding_dict:
                element_embedding_dict[element] = embedding_retriever(element)

# Write the extended cache back to disk.
with open('/data/pj20/gre_element_embedding_dict.json', 'w') as f:
    json.dump(element_embedding_dict, f, indent=4)

100%|██████████| 200/200 [01:09<00:00,  2.87it/s]


### Add embeddings for model outputs

In [4]:
from openai_emb import embedding_retriever
from tqdm import tqdm
import json

# Result files are named '<dataset>_<model>_<seed>.json'; every combination of
# the three lists below is processed.
model_names = [
    # 'vicuna-1.3-33b', 
    # 'llama-2-70b',
    # 'gpt-3.5-turbo-1106',
    # 'gpt-4-1106-preview',
    # 'openchat',
    'gpt-3.5_closed',
    'gpt-3.5_semi'
    ]

dataset_names = [
    'cdr_rand_200',
    'nyt10m_rand_500',
]

seeds = [1]


# Reuse the embedding cache already held by the kernel when there is one.
# Otherwise load it from disk, so this cell also runs on a fresh kernel
# instead of failing with a NameError.
if 'element_embedding_dict' not in globals():
    with open('/data/pj20/gre_element_embedding_dict.json', 'r') as f:
        element_embedding_dict = json.load(f)

for model_name in model_names:
    for dataset_name in dataset_names:
        for seed in seeds:
            file_to_evaluate = f'../processed_results/{dataset_name}_{model_name}_{seed}.json'
            # Context manager so each results file is closed once parsed.
            with open(file_to_evaluate, 'r') as results_file:
                text_triples = json.load(results_file)

            for text in tqdm(text_triples.keys()):
                for triple in text_triples[text]:
                    term_1, relation, term_2 = triple
                    # Fetch an embedding only for elements not yet cached.
                    for element in (term_1, term_2, relation):
                        if element not in element_embedding_dict:
                            element_embedding_dict[element] = embedding_retriever(element)


# Write the extended cache back to disk once all result files are processed.
with open('/data/pj20/gre_element_embedding_dict.json', 'w') as f:
    json.dump(element_embedding_dict, f, indent=4)

100%|██████████| 200/200 [00:00<00:00, 209453.38it/s]
 22%|██▏       | 108/500 [00:13<00:46,  8.50it/s]

Error in gpt_instruct: "place_of_birth, nationality, location, place_lived". Retrying...


 27%|██▋       | 136/500 [00:20<00:40,  8.95it/s]

Error in gpt_instruct: "leader of the Shiite Amal party and speaker of Lebanon's Parliament". Retrying...


 76%|███████▌  | 379/500 [01:08<00:09, 12.75it/s]

Error in gpt_instruct: "administrative_divisions". Retrying...


100%|██████████| 500/500 [02:38<00:00,  3.16it/s]
100%|██████████| 200/200 [00:00<00:00, 219597.07it/s]
 38%|███▊      | 191/500 [00:19<00:45,  6.86it/s]

Error in gpt_instruct: "Delawarean". Retrying...


 54%|█████▍    | 270/500 [00:32<00:33,  6.94it/s]

Error in gpt_instruct: "doubt". Retrying...


 73%|███████▎  | 363/500 [01:00<00:04, 31.46it/s]

Error in gpt_instruct: "position:administrative_division". Retrying...
Error in gpt_instruct: "new money". Retrying...
Error in gpt_instruct: "new money". Retrying...


 85%|████████▌ | 427/500 [03:25<00:22,  3.18it/s]

Error in gpt_instruct: "airport terminal:location". Retrying...


100%|██████████| 500/500 [03:42<00:00,  2.25it/s]
