In [1]:
import json
import random
import pandas as pd
from tqdm import tqdm

# Generate questions for DPR inference

In [3]:
def _read_json(path):
    """Parse the JSON file at `path` and return the decoded object."""
    with open(path) as handle:
        return json.load(handle)


# Closed-setting splits, the open-setting dev split, and the evidence annotations.
train_data = _read_json("../data/rawdata/train_dataset.json")
dev_data = _read_json("../data/rawdata/dev_dataset.json")
dev_open_data = _read_json("../data/open_setting_data/dev_data_shared_entities_ranked.json")
test_data = _read_json("../data/rawdata/test_dataset_closed.json")
train_evi = _read_json("../data/rawdata/train_evi.json")
dev_evi = _read_json("../data/rawdata/dev_evi.json")

In [2]:
# Load the open-setting test pairs.
with open("../data/open_setting_data/test_data_shared_entities_ranked.json") as open_test_file:
    test_open_data = json.load(open_test_file)

In [3]:
# Question templates keyed by template name; each is filled with
# str.format(head=..., tail=...).
question_template = dict(
    naive="what is the relation between {head} and {tail}?",
)

# Wikidata query

In [4]:
import requests

def get_entity_name(query, timeout=30):
    """Search Wikidata for `query` with the `wbsearchentities` API action.

    Args:
        query: Search string. This notebook passes Wikidata ids such as
            "Q1974" (entity) or "P1343" (property).
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            stalled connection would block the caller indefinitely.

    Returns:
        The list stored under the "search" key of the JSON response.

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            non-success HTTP status.
        KeyError: if the JSON response carries no "search" key.
    """
    API_ENDPOINT = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbsearchentities',
        # Queries without a "Q" are searched as properties. For the others the
        # value is None, which requests leaves out of the query string, so the
        # API falls back to its default entity type.
        'type': None if "Q" in query else 'property',
        'format': 'json',
        'language': 'en',
        'search': query
    }
    r = requests.get(API_ENDPOINT, params=params, timeout=timeout)
    # Surface HTTP errors (e.g. rate limiting) instead of trying to parse an error page.
    r.raise_for_status()
    return r.json()['search']

In [5]:
# Collect every entity id that occurs as head or tail in any closed-setting split.
entities = set()
for split in (train_data, dev_data):
    for row in split:
        # The first field of a train/dev row is "<head id>#<tail id>".
        head_id, tail_id = row[0].split("#")
        entities |= {head_id, tail_id}
# NOTE(review): test rows are read by key here, while the test question cell
# further down reads them as `each[0].split("#")` — confirm which layout
# test_dataset_closed.json actually has.
for row in test_data:
    entities |= {row['h_id'], row['t_id']}

In [6]:
# Map each entity id to the label of its first search hit; ids with no usable
# hit are collected in `ood` for separate handling.
# NOTE(review): the first hit is trusted without comparing its id to the
# queried id — confirm the search always ranks the exact id first.
entities2name = {}
ood = []
for entity_id in tqdm(entities):
    hits = get_entity_name(entity_id)
    if not hits or 'label' not in hits[0]:
        ood.append(entity_id)
        continue
    entities2name[entity_id] = hits[0]['label']

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11971/11971 [46:25<00:00,  4.30it/s]


In [7]:
# Show the ids for which the search loop found no labelled hit.
ood

['Q61746295',
 'Q1979',
 'Q962190',
 'Q23306030',
 'Q1974',
 'Q25349812',
 'Q51590545',
 'Q311555',
 'Q3030436',
 'Q7125563',
 'Q57422063',
 'Q18358526',
 'Q42888166',
 'Q18528831',
 'Q75932823',
 'Q6214685',
 'Q58652618',
 'Q27150222',
 'Q1028086',
 'Q2534695',
 'Q5967378',
 'Q5146134']

In [37]:
# Hand-entered labels for some of the ids listed in `ood`.
entities2name.update({
    'Q18528831': 'Macleans of Duart',
    'Q1974': 'British Columbia',
    'Q6214685': 'Jogamaya Devi College',
    'Q962190': 'Eadburh',
    'Q2534695': 'boxwood',
    'Q1028086': 'Khanbaliq',
    'Q61746295': 'Nickol Bay',
    'Q57422063': 'Plautia',
    'Q3030436': 'Disney Interactive',
    'Q75932823': 'Eleanor le Despenser',
    'Q27150222': 'fair ground',
    'Q5967378': 'speculative fiction',
})

In [11]:
# Fetch the unresolved ids directly with the `wbgetentities` action.
# - The language filter of this action is `languages` (plural); the previous
#   `language=en` was not a recognised parameter.
# - Only labels are requested (`props=labels`), since that is all the next
#   cell reads.
# - Ids are sent in batches because the API caps the number of ids per request
#   (50 for regular clients), and an empty `ood` now yields an empty dict
#   instead of a malformed request.
WBGETENTITIES_MAX_IDS = 50
ood_info = {}
for start in range(0, len(ood), WBGETENTITIES_MAX_IDS):
    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbgetentities",
            "ids": "|".join(ood[start:start + WBGETENTITIES_MAX_IDS]),
            "props": "labels",
            "languages": "en",
            "format": "json",
        },
        timeout=30,  # do not hang forever on a stalled connection
    )
    response.raise_for_status()
    ood_info.update(response.json()['entities'])

In [45]:
# Copy the English label of every fetched entity into entities2name.
# Entities without an English label (including ids the API reports as missing)
# are printed so they can be filled in by hand, instead of raising KeyError.
for key, info in ood_info.items():
    english_label = info.get('labels', {}).get('en')
    if english_label is not None:
        entities2name[key] = english_label['value']
    else:
        print(key, info)

Q25349812 {'id': 'Q25349812', 'missing': ''}
Q7125563 {'id': 'Q7125563', 'missing': ''}
Q18358526 {'id': 'Q18358526', 'missing': ''}
Q42888166 {'id': 'Q42888166', 'missing': ''}


In [46]:
# Hand-entered labels for the four ids the API reported as missing.
entities2name.update({
    'Q25349812': 'Sanctuary Wood',
    'Q7125563': 'Pakistan Democratic Party',
    'Q18358526': 'Ossama Youssef',
    'Q42888166': 'Target Travel',
})

In [5]:
# Load the q2t mapping (entity id -> {document title: count}, judging by the
# lookup shown in the next cell).
with open("../data/q2t.json") as q2t_file:
    q2t = json.load(q2t_file)

In [43]:
# Inspect the q2t entry for one of the ids the API reported as missing.
q2t['Q42888166']

{'Home Park': 4, 'Stagecoach South West': 4, 'Plympton': 4}

In [30]:
import redis
# Redis client for a server on localhost:6379 (host and port are hard-coded).
# decode_responses=True is passed so replies should come back as str rather
# than bytes — per redis-py; confirm for the installed version.
redisd = redis.Redis(host='localhost', port=6379, decode_responses=True)

In [44]:
# Look up the cached "Home Park" document and print the surface name of every
# entity annotated with the numeric id 42888166.
home_park_doc = json.loads(redisd.get('codred-doc-Home Park'))
for entity in home_park_doc['entities']:
    if 'Q' not in entity:
        continue
    if entity['Q'] == 42888166:
        print(entity['name'])

Target Travel


In [47]:
# Sanity check: the two sizes should match once every entity id has a name.
len(entities), len(entities2name)

(11971, 11971)

In [48]:
# Persist the entity-id -> name mapping for later sessions.
with open("../data/q2name.json", "w") as q2name_file:
    json.dump(entities2name, q2name_file)

In [55]:
# Resolve every relation id to the label of its first search hit.
# NOTE(review): `[0]` raises IndexError when a relation id yields no hit.
relation2name = {}
with open("../data/rawdata/relations.json") as relations_file:
    relations = json.load(relations_file)
for relation_id in relations:
    top_hit = get_entity_name(relation_id)[0]
    relation2name[relation_id] = top_hit['label']

In [57]:
# Persist the relation-id -> name mapping for later sessions.
with open("../data/r2name.json", "w") as r2name_file:
    json.dump(relation2name, r2name_file)

# Generate questions

In [4]:
# Reload the cached id -> name mappings written by the Wikidata section.
with open("../data/q2name.json") as q2name_file:
    entities2name = json.load(q2name_file)
with open("../data/r2name.json") as r2name_file:
    relation2name = json.load(r2name_file)

In [5]:
# Build one DPR question record per open-setting test pair.
# tqdm wraps the iterable so the bar is closed when the loop finishes; the
# earlier manual `tqdm(total=...)` bar was never closed and kept re-rendering
# under later cells.
codred_question_open_test_dataset = []
for idx, each in enumerate(tqdm(test_open_data)):
    # Field 0 is "<head id>#<tail id>"; field 3 is the relation id or 'n/a'.
    h, t = each[0].split("#")
    h_name = entities2name[h]
    t_name = entities2name[t]

    r = each[3]
    r_name = 'not available' if r == 'n/a' else relation2name[r]
    codred_question_open_test_dataset.append({
        "question": question_template['naive'].format(head=h_name, tail=t_name),
        "answers": [r_name],
        "id": idx
    })

100%|██████████| 77940/77940 [00:19<00:00, 230285.43it/s]

In [6]:
# Write the open-setting test questions as JSON Lines (one record per line).
with open("../data/DPR/inference/codred_questions_open_test_dataset.jsonl", "w") as out_file:
    out_file.writelines(
        json.dumps(record) + "\n" for record in codred_question_open_test_dataset
    )

In [10]:
# Build one DPR question record per open-setting dev pair.
# tqdm wraps the iterable so the bar is closed when the loop finishes; the
# earlier manual `tqdm(total=...)` bar was never closed and kept re-rendering
# under later cells.
codred_question_open_dev_dataset = []
for idx, each in enumerate(tqdm(dev_open_data)):
    # Field 0 is "<head id>#<tail id>"; field 3 is the relation id or 'n/a'.
    h, t = each[0].split("#")
    h_name = entities2name[h]
    t_name = entities2name[t]

    r = each[3]
    r_name = 'not available' if r == 'n/a' else relation2name[r]
    codred_question_open_dev_dataset.append({
        "question": question_template['naive'].format(head=h_name, tail=t_name),
        "answers": [r_name],
        "id": idx
    })


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78023/78023 [00:26<00:00, 2948.00it/s][A

 33%|████████████████████████████████████████████████████▎                                                                                                        | 25972/78023 [00:00<00:00, 259715.91it/s][A
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 64879/78023 [00:00<00:00, 335805.78it/s][A

In [12]:
# Write the open-setting dev questions as JSON Lines (one record per line).
with open("../data/DPR/inference/codred_questions_open_dev_dataset.jsonl", "w") as out_file:
    out_file.writelines(
        json.dumps(record) + "\n" for record in codred_question_open_dev_dataset
    )

In [8]:
# Build one DPR question record per closed-setting train pair.
# tqdm wraps the iterable so the bar is closed when the loop finishes; the
# earlier manual `tqdm(total=...)` bar was never closed and kept re-rendering
# under later cells.
codred_question_train_dataset = []
for idx, each in enumerate(tqdm(train_data)):
    # Field 0 is "<head id>#<tail id>"; field 3 is the relation id or 'n/a'.
    h, t = each[0].split("#")
    h_name = entities2name[h]
    t_name = entities2name[t]

    r = each[3]
    r_name = 'not available' if r == 'n/a' else relation2name[r]
    codred_question_train_dataset.append({
        "question": question_template['naive'].format(head=h_name, tail=t_name),
        "answers": [r_name],
        "id": idx
    })

 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                 | 101562/129548 [00:00<00:00, 146543.25it/s]

In [9]:
# Write the train questions as JSON Lines (one record per line).
with open("../data/DPR/inference/codred_questions_train_dataset.jsonl", "w") as out_file:
    out_file.writelines(
        json.dumps(record) + "\n" for record in codred_question_train_dataset
    )

In [10]:
# Build one DPR question record per closed-setting dev pair.
# tqdm wraps the iterable so the bar is closed when the loop finishes; the
# earlier manual `tqdm(total=...)` bar was never closed and kept re-rendering
# under later cells.
codred_question_dev_dataset = []
for idx, each in enumerate(tqdm(dev_data)):
    # Field 0 is "<head id>#<tail id>"; field 3 is the relation id or 'n/a'.
    h, t = each[0].split("#")
    h_name = entities2name[h]
    t_name = entities2name[t]

    r = each[3]
    r_name = 'not available' if r == 'n/a' else relation2name[r]
    codred_question_dev_dataset.append({
        "question": question_template['naive'].format(head=h_name, tail=t_name),
        "answers": [r_name],
        "id": idx
    })


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 129548/129548 [00:06<00:00, 19559.87it/s][A

 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                | 28091/40740 [00:00<00:00, 280903.57it/s][A

In [11]:
# Write the dev questions as JSON Lines (one record per line).
with open("../data/DPR/inference/codred_questions_dev_dataset.jsonl", "w") as out_file:
    out_file.writelines(
        json.dumps(record) + "\n" for record in codred_question_dev_dataset
    )

In [13]:
# Build one DPR question record per closed-setting test pair. The gold
# relation is not used for this split, so every answer is the placeholder 'unk'.
# tqdm wraps the iterable so the bar is closed when the loop finishes; the
# earlier manual `tqdm(total=...)` bar was never closed and kept re-rendering
# under later cells.
# NOTE(review): rows are read positionally here (`each[0]`), while the
# entity-collection cell reads the same split via each['h_id'] / each['t_id']
# — confirm which layout test_dataset_closed.json actually has.
codred_question_test_dataset = []
for idx, each in enumerate(tqdm(test_data)):
    h, t = each[0].split("#")
    h_name = entities2name[h]
    t_name = entities2name[t]

    r_name = 'unk'
    codred_question_test_dataset.append({
        "question": question_template['naive'].format(head=h_name, tail=t_name),
        "answers": [r_name],
        "id": idx
    })


  0%|                                                                                                                                                                             | 0/40524 [00:12<?, ?it/s][A

 52%|█████████████████████████████████████████████████████████████████████████████████▋                                                                           | 21077/40524 [00:00<00:00, 210768.19it/s][A

In [14]:
# Write the test questions as JSON Lines (one record per line).
with open("../data/DPR/inference/codred_questions_test_dataset.jsonl", "w") as out_file:
    out_file.writelines(
        json.dumps(record) + "\n" for record in codred_question_test_dataset
    )


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40524/40524 [00:16<00:00, 210768.19it/s][A

# Generate DPR finetune data

In [4]:
# Load the evidence annotations for the train and dev splits.
with open("../data/rawdata/train_evi.json") as train_evi_file:
    train_evi = json.load(train_evi_file)
with open("../data/rawdata/dev_evi.json") as dev_evi_file:
    dev_evi = json.load(dev_evi_file)

In [5]:
# Peek at one evidence record to see the fields used when building finetune data.
train_evi[0]

{'h': 'Luxeuil Abbey',
 't': 'Catholic Encyclopedia',
 'r': 'P1343',
 'doc_h': 'Columbanus',
 'doc_t': 'Quapaw',
 'evis_h': [[0, 0]],
 'evis_t': [[22, 0], [26, 0]],
 'id': 29784012,
 'key': 'Q1232116#Q302556'}

In [6]:
# Load the tab-separated passage table; the following cell uses its "id",
# "text" and "title" columns.
# NOTE(review): read_csv's default NA parsing turns cells such as "NA" or
# "null" into NaN — confirm no title/text is affected, or consider
# keep_default_na=False.
passages = pd.read_csv("../data/codred_passages.tsv", sep="\t")

In [7]:
# Build passage id -> (title, text).
# - The id-indexed frame gets its own name instead of mutating `passages` in
#   place, so re-running this cell no longer fails on a missing "id" column.
# - Columns are selected by name and zipped, which avoids the per-row Series
#   that iterrows() creates and does not depend on column order.
passages_by_id = passages.set_index("id")
print(passages_by_id)
passage_dict = {}
for idx, title, text in tqdm(
    zip(passages_by_id.index, passages_by_id["title"], passages_by_id["text"]),
    total=len(passages_by_id),
):
    passage_dict[idx] = (title, text)

                                                         text          title
id                                                                          
12_0        Anarchism is a radical political movement that...      Anarchism
12_1        Anarchism 's timeline stretches back to prehis...      Anarchism
12_2        Anarchism employ various tactics in order to m...      Anarchism
12_3        The etymological origin of the word `` anarchi...      Anarchism
12_4        The first political philosopher to call himsel...      Anarchism
...                                                       ...            ...
63097069_2  Cantor is a career member of the Senior Execut...  Carmen Cantor
63097069_3  On July 15 , 2019 , President Trump announced ...  Carmen Cantor
63097069_4  On October 16 , 2019 , she appeared before the...  Carmen Cantor
63097069_5  Cantor is fluent in Spanish . She is married t...  Carmen Cantor
63149555_0                       Safiye Sultan may refer to :  Safiye Sultan

5193458it [03:18, 26229.47it/s]


In [8]:
# Load the document-title -> document-id mapping.
with open("../data/title2id.json") as title2id_file:
    title2id = json.load(title2id_file)

In [9]:
# Load the relation-id -> name mapping used for the "answers" field.
with open("../data/r2name.json") as r2name_file:
    r2name = json.load(r2name_file)

In [10]:
# Group passage ids by document: the part of the id before the first "_" is
# used as the document key.
doc2passage = {}
for passage_id in passage_dict:
    doc2passage.setdefault(passage_id.split("_")[0], []).append(passage_id)

In [11]:
def _evidence_ctxs(doc_title, evidence, pos_set):
    """Return the positive contexts for one document's evidence list.

    Each evidence entry is a pair whose first item is a passage index. An index
    of -1 yields a title-only context (the title doubles as the text). Every
    other index is resolved through `title2id` and `passage_dict`, and its
    passage id is added to `pos_set`.
    """
    ctxs = []
    for (pid, _) in evidence:
        if pid == -1:
            ctxs.append({"title": doc_title, "text": doc_title})
            continue
        lid = f"{title2id[doc_title]}_{pid}"
        pos_set.add(lid)
        title, text = passage_dict[lid]
        ctxs.append({"title": title, "text": text})
    return ctxs


def _sample_negative_ctxs(doc_titles, pos_set, sample_size):
    """Sample up to `sample_size` non-positive passages from each document.

    Positives are removed before sampling, so a draw can never be wasted on an
    evidence passage. Duplicates (e.g. when both titles name the same document)
    are dropped while keeping draw order, which makes the result reproducible
    for a fixed `random` seed.
    """
    picked = []
    for doc_title in doc_titles:
        candidates = [lid for lid in doc2passage[title2id[doc_title]] if lid not in pos_set]
        picked.extend(random.sample(candidates, min(sample_size, len(candidates))))
    ctxs = []
    for lid in dict.fromkeys(picked):
        title, text = passage_dict[lid]
        ctxs.append({"title": title, "text": text})
    return ctxs


def transform(data, sample_size=5):
    """Convert evidence records into DPR finetuning records.

    For every sample one base record is produced, followed by one augmented
    record per positive context. An augmented record appends that context's
    text to the question and keeps the remaining positives.

    Reads the module-level `question_template`, `r2name`, `title2id`,
    `passage_dict` and `doc2passage`.

    Args:
        data: Iterable of dicts with keys "h", "t", "r", "doc_h", "doc_t",
            "evis_h" and "evis_t".
        sample_size: Maximum number of negative passages drawn from each of
            the head and tail documents.

    Returns:
        A list of dicts with keys "question", "answers", "positive_ctxs",
        "negative_ctxs" and "hard_negative_ctxs" (always empty).
    """
    outputs = []
    for sample in tqdm(data):
        question = question_template["naive"].format(head=sample["h"], tail=sample["t"])
        answers = [r2name[sample["r"]]]

        pos_set = set()
        positive_ctxs = (
            _evidence_ctxs(sample["doc_h"], sample["evis_h"], pos_set)
            + _evidence_ctxs(sample["doc_t"], sample["evis_t"], pos_set)
        )
        doc_titles = (sample["doc_h"], sample["doc_t"])

        outputs.append({
            "question": question,
            "answers": answers,
            "positive_ctxs": positive_ctxs,
            "negative_ctxs": _sample_negative_ctxs(doc_titles, pos_set, sample_size),
            "hard_negative_ctxs": [],
        })

        # NOTE(review): when a sample has a single positive context, its
        # augmented record ends up with no positives — confirm the consumer
        # filters such records.
        for pos_ctx in positive_ctxs:
            outputs.append({
                "question": question + " " + pos_ctx["text"],
                # Same schema as the base record so consumers can treat both alike.
                "answers": list(answers),
                "positive_ctxs": [each for each in positive_ctxs if each != pos_ctx],
                "negative_ctxs": _sample_negative_ctxs(doc_titles, pos_set, sample_size),
                "hard_negative_ctxs": [],
            })
    return outputs

In [12]:
# transform() draws negatives with the `random` module; seed it so the written
# finetune files depend only on this value and the input data.
SEED = 42
random.seed(SEED)
train_outputs = transform(train_evi)
dev_outputs = transform(dev_evi)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12013/12013 [00:01<00:00, 6204.73it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3497/3497 [00:00<00:00, 4792.96it/s]


In [13]:
# Write the DPR finetuning records, one JSON array per split.
with open("../data/DPR/finetune/train.json", "w") as train_file:
    json.dump(train_outputs, train_file)
with open("../data/DPR/finetune/dev.json", "w") as dev_file:
    json.dump(dev_outputs, dev_file)