In [5]:
import json
import random
import requests 
import pandas as pd

Data Generation from 2WikiMultiHop

In [3]:


def download_url(url, save_path, chunk_size=128, timeout=30):
    """Stream the resource at ``url`` into the file ``save_path``.

    Args:
        url: Address to fetch with an HTTP GET request.
        save_path: Destination file path; opened in binary write mode, so an
            existing file is overwritten.
        chunk_size: Number of bytes requested per chunk while streaming.
        timeout: Seconds to wait for the server to connect or send data
            before giving up. Passed straight to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Without this check the error page would be written to
            ``save_path`` as if it were the requested file.
    """
    # The response is used as a context manager so the underlying connection
    # is released even when writing to disk fails part-way through.
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Check the status before opening the destination, so a failed
        # request neither creates nor truncates the output file.
        r.raise_for_status()
        with open(save_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)

# Download the data archive (presumably the 2WikiMultiHop dump, given the
# target file name) from Dropbox.
# NOTE(review): save_path is an absolute path inside one user's home
# directory, so this cell only runs unchanged on that machine.
# NOTE(review): the archive is saved as 2WMHdata.zip, while the loading cell
# reads ../data/raw/2WikiMultiHop/dev.json; no extraction step is visible in
# this notebook -- presumably the archive is unpacked by hand; confirm.
# NOTE(review): Dropbox share links commonly return an HTML preview page
# unless `?dl=1` is appended -- confirm the saved file is a valid zip.
download_url(url="https://www.dropbox.com/s/ms2m13252h6xubs/data_ids_april7.zip", save_path="/Users/aswathyajith/dev/research/mech-int/data/2WMHdata.zip")

In [6]:
# Load the dev split. The encoding is pinned to UTF-8 so that any non-ASCII
# text decodes the same way regardless of the platform's locale default.
file_name = "../data/raw/2WikiMultiHop/dev.json"
with open(file_name, 'r', encoding='utf-8') as f:
    dev_2wmh = json.load(f)

# The file handle is only needed while parsing, so filtering and sampling
# happen after the `with` block has closed it.
# get a random 100 sample of compositional type questions
comp_dev = [qn for qn in dev_2wmh if qn['type'] == 'compositional']
random.seed(42)  # fixed seed: the same 100 questions are drawn on every run
comp_100 = random.sample(comp_dev, 100)

In [27]:
# Function for verbalizing the fact triple
def verbalize_triples(triple1, triple2, explicit=True):
    """Verbalize the second fact of a two-fact chain as a sentence.

    Args:
        triple1: First fact as a 3-item sequence (subject, relation, object).
            Only its subject and relation are used, and only when
            ``explicit`` is False.
        triple2: Second fact as a 3-item sequence (subject, relation, object).
        explicit: If True, the subject of ``triple2`` is named directly. If
            False, it is replaced by the description
            ``"the <relation1> of <subject1>"`` built from ``triple1``.

    Returns:
        str: The sentence for ``triple2``'s relation with its first character
        upper-cased. Relations without a dedicated template fall back to
        ``"The <relation> of <subject> is"``. Every template except
        "has part" stops before the object; "has part" is a full sentence
        that includes it.
    """
    s1, r1, _ = triple1
    s2, r2, o2 = triple2
    if not explicit:
        s2 = f"the {r1} of {s1}"

    # The "founded by" template puts a definite article before the subject.
    # Skip it when the subject already starts with one; otherwise the
    # non-explicit form reads "The the <r1> of <s1> was founded by".
    if str(s2).lower().startswith("the "):
        founded_subject = s2
    else:
        founded_subject = f"The {s2}"

    triple2_phrase = {
        "inception": f"The {r2} of {s2} was in",
        "founded by": f"{founded_subject} was {r2}",
        "educated at": f"{s2} was {r2}",
        "has part": f"{o2} is a part of {s2}.",
        "award received": f"{s2} received the",
        "default": f"The {r2} of {s2} is",
    }

    # Any relation without its own template uses the generic one.
    sent = triple2_phrase.get(r2, triple2_phrase["default"])
    return sent[:1].upper() + sent[1:]  # Capitalize first letter

In [30]:
# Build one record per sampled question: both verbalizations of the second
# fact, the entity mention each one uses, the two evidence triples, the
# answer, and the question text.
data = []

for instance in comp_100:
    # The first two evidence triples are used as the two facts of the chain.
    fact_1 = instance['evidences'][0]
    fact_2 = instance['evidences'][1]

    data.append({
        'explicit_sent': verbalize_triples(fact_1, fact_2, True),
        'obscure_sent': verbalize_triples(fact_1, fact_2, False),
        'explicit_entity': fact_2[0],
        'obscure_entity': f"the {fact_1[1]} of {fact_1[0]}",
        'fact1': fact_1,
        'fact2': fact_2,
        'answer': instance['answer'],
        # The question was previously read but never stored. It is added as
        # the last key so the existing columns keep their positions.
        'question': instance['question'],
    })

df = pd.DataFrame(data)

In [31]:
# Persist the assembled DataFrame as CSV; the path is relative to the
# notebook's working directory.
# No `index` argument is passed, so pandas' default applies and the row index
# is written as an unnamed first column.
df.to_csv("../data/multi_hop_100.csv")