In [74]:
import pandas as pd
import numpy as np
from utils.graph import KGraphPreproc
import networkx as nx

In [66]:
mqa_graph = KGraphPreproc.get_metaqa_graph()

In [67]:
import re
regex_mqa_topic_entity = re.compile("\[(.*?)\]")
def extract_mqa_topic_entity(question):
    return regex_mqa_topic_entity.findall(question)[0]

In [68]:
mqa = pd.DataFrame(columns=["id", "hop", "Question", "topic_entity", "Answer"])
mqa

Unnamed: 0,id,hop,Question,topic_entity,Answer


In [69]:
hops = [1,2,3]
for hop in hops:
    test_set_path = f"/datasets/MetaQA/{hop}hop/qa_test.txt"
    tset = pd.read_csv(test_set_path, sep="\t", header=None)
    tset.rename(columns={0: "Question", 1: "Answer"}, inplace=True)
    tset["topic_entity"] = tset.Question.apply(extract_mqa_topic_entity)
    tset.Answer = tset.apply(lambda t: set(t.Answer.lower().split("|")), axis=1)
    tset = tset.explode("Answer")
    tset.Answer = tset.Answer.apply(preprocess_text)
    tset["id"] = tset.apply(lambda r: f"{hop}-{r.name}", axis=1)
    tset["hop"] = hop
    mqa = pd.concat([mqa, tset])
mqa.dropna(inplace=True)
print("Total unique question-answer pairs:", len(mqa))

Total unique question-answer pairs: 302779


In [70]:
print(
    "Question with topic entities in graph:",
    sum(mqa["topic_entity"].apply(lambda e: mqa_graph.has_node(mqa_graph.name2mid[e])))
)
print(
    "Question with answer entities in graph:",
    sum(mqa["Answer"].apply(lambda e: mqa_graph.has_node(mqa_graph.preprocessed_nodes[e])))
)

Question with topic entities in graph: 302779
Question with answer entities in graph: 302779


In [71]:
print(
    "Questions with available paths:",
    sum(mqa.apply(
        lambda r: nx.has_path(
            mqa_graph._graph,
            mqa_graph.name2mid[r.topic_entity],
            mqa_graph.preprocessed_nodes[r.Answer]),
        axis=1
    ))
)

Questions with available paths: 302779


In [76]:
mqa["path"] = mqa.apply(
    lambda r: nx.shortest_path(
        mqa_graph._graph,
        mqa_graph.name2mid[r["topic_entity"]],
        mqa_graph.preprocessed_nodes[r.Answer]
    ),
    axis=1
)

In [77]:
mqa

Unnamed: 0,id,hop,Question,topic_entity,Answer,path
0,1-0,1,what does [Grégoire Colin] appear in,Grégoire Colin,befor the rain,"[176, 173, 16, 540, 32794]"
1,1-1,1,[Joe Thomas] appears in which movies,Joe Thomas,the inbetween 2,"[20874, 20869]"
1,1-1,1,[Joe Thomas] appears in which movies,Joe Thomas,the inbetween movi,"[20874, 40449]"
2,1-2,1,what films did [Michelle Trachtenberg] star in,Michelle Trachtenberg,the scribbler,"[17337, 21532]"
2,1-2,1,what films did [Michelle Trachtenberg] star in,Michelle Trachtenberg,inspector gadget,"[17337, 42379]"
...,...,...,...,...,...,...
14272,3-14272,3,the films written by the writer of [A Personal...,A Personal Journey with Martin Scorsese Throug...,sharon stone,"[10070, 1170, 27209, 4312]"
14272,3-14272,3,the films written by the writer of [A Personal...,A Personal Journey with Martin Scorsese Throug...,robert de niro,"[10070, 1170, 1169, 1174]"
14272,3-14272,3,the films written by the writer of [A Personal...,A Personal Journey with Martin Scorsese Throug...,lionel atwil,"[10070, 1170, 41440, 5479]"
14272,3-14272,3,the films written by the writer of [A Personal...,A Personal Journey with Martin Scorsese Throug...,winona ryder,"[10070, 1170, 41440, 2607, 2604, 2610]"


In [80]:
mqa.drop_duplicates(["Question", "topic_entity", "Answer"], inplace=True)

In [81]:
mqa.head()

Unnamed: 0,id,hop,Question,topic_entity,Answer,path
0,1-0,1,what does [Grégoire Colin] appear in,Grégoire Colin,befor the rain,"[176, 173, 16, 540, 32794]"
1,1-1,1,[Joe Thomas] appears in which movies,Joe Thomas,the inbetween 2,"[20874, 20869]"
1,1-1,1,[Joe Thomas] appears in which movies,Joe Thomas,the inbetween movi,"[20874, 40449]"
2,1-2,1,what films did [Michelle Trachtenberg] star in,Michelle Trachtenberg,the scribbler,"[17337, 21532]"
2,1-2,1,what films did [Michelle Trachtenberg] star in,Michelle Trachtenberg,inspector gadget,"[17337, 42379]"


In [83]:
len(mqa)

302522

## Create a reproducible sample of a 1000 questions in each category

In [85]:
hops = [1,2,3]
for hop in hops:
    mqa = pd.read_csv(f"/datasets/MetaQA/{hop}hop/qa_test.txt", sep="\t", header=None)
    np.random.seed(42)
    mqa = mqa.sample(n=1000)
    mqa.to_csv(f"/datasets/MetaQA/{hop}hop/test_1000.txt", header=False)