### Convert the full 14k MetaQA (MQA) results into a 1k representative sample

In [5]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import os
from utils.graph import KGraphPreproc
from utils.evaluation import FBQA_Dataset, MetaQA_Dataset, CWQ_Dataset
import networkx as nx

In [2]:
# Build one evaluation object per dataset class imported from utils.evaluation.
# The cells below call tabulate_performance() on each of them and read
# mqa_eval.test_sets / mqa_eval.hops.
cwq_eval = CWQ_Dataset()
fbqa_eval = FBQA_Dataset()
mqa_eval = MetaQA_Dataset()

In [25]:
# Show the per-method score table for the CWQ dataset.
cwq_eval.tabulate_performance()

CWQ
+-----------+------------+
| Method    |   Test Set |
| bline     |      0.3   |
+-----------+------------+
| kb-path   |      0.621 |
+-----------+------------+
| kb1       |      0.291 |
+-----------+------------+
| kb2       |      0.302 |
+-----------+------------+
| kb3       |      0.301 |
+-----------+------------+
| sbert-kb1 |      0.241 |
+-----------+------------+
| sbert-kb2 |      0.226 |
+-----------+------------+
| sbert-kb3 |      0.217 |
+-----------+------------+


In [11]:
# Show the per-method score table for the FBQA dataset.
fbqa_eval.tabulate_performance()

FBQA
+-----------+------------+
| Method    |   Test Set |
| bline     |      0.762 |
+-----------+------------+
| bline2    |      0.752 |
+-----------+------------+
| kb-path   |      0.795 |
+-----------+------------+
| kb1       |      0.737 |
+-----------+------------+
| kb2       |      0.745 |
+-----------+------------+
| kb3       |      0.737 |
+-----------+------------+
| sbert-kb1 |      0.511 |
+-----------+------------+
| sbert-kb2 |      0.563 |
+-----------+------------+
| sbert-kb3 |      0.559 |
+-----------+------------+


In [22]:
# Show the per-method score table for MetaQA, one column per hop split.
mqa_eval.tabulate_performance()

MetaQA
+-----------+----------+--------+--------+
| Method    |     1hop |   2hop |   3hop |
| bline     | 0.388    |  0.222 |  0.398 |
+-----------+----------+--------+--------+
| bline2    | 0.397    |  0.209 |  0.39  |
+-----------+----------+--------+--------+
| kb-path   | 0.972    |  0.974 |  0.981 |
+-----------+----------+--------+--------+
| kb1       | 0.88     |  0.166 |  0.341 |
+-----------+----------+--------+--------+
| kb2       | 0.841    |  0.739 |  0.358 |
+-----------+----------+--------+--------+
| kb3       | 0.834835 |  0.185 |  0.319 |
+-----------+----------+--------+--------+
| sbert-kb1 | 0.854    |  0.168 |  0.356 |
+-----------+----------+--------+--------+
| sbert-kb2 | 0.86     |  0.733 |  0.357 |
+-----------+----------+--------+--------+
| sbert-kb3 | 0.801    |  0.514 |  0.313 |
+-----------+----------+--------+--------+


In [65]:
# Preview the first two rows of the "1hop" test set (indexed by qid, with
# Question and Answers columns).
mqa_eval.test_sets["1hop"].head(2)

Unnamed: 0_level_0,Question,Answers
qid,Unnamed: 1_level_1,Unnamed: 2_level_1
3710,who is the director that directed [Big Jake],"{george sherman, john wayne}"
5313,what words describe [The Front Page],"{walter matthau, jack lemmon, billy wilder}"


In [115]:
# Hop split used by the single-question lookup in the next cell.
# NOTE: the `for hop in mqa_eval.hops` loops further down reuse this name as their
# loop variable and overwrite it, so re-run this cell before repeating the lookup.
hop= "1hop"

In [125]:
# Inspect one question of the selected hop's test set by its index label (789).
mqa_eval.test_sets[hop].loc[789]

Question            what does [Walter Steiner] act in
Answers     {the great ecstasy of woodcarver steiner}
Name: 789, dtype: object

In [58]:
# Path templates of the full MetaQA result files, one per method.
# "{hop}" is intentionally left as a placeholder: the loops below fill it in with
# str.format(hop=...) for every hop split.
result_set_paths = [
    "/datasets/MetaQA/results/{hop}/full/" + method + ".csv"
    for method in ("bline", "bline2", "kb1", "kb2", "kb3", "sbert-kb3")
]

In [163]:
# Join every available full-result file with the sampled test questions of its hop
# and preview the last join that succeeded.
#
# `mg` is reset first: without the reset, a run in which no result file exists would
# either raise NameError (fresh kernel) or silently preview a frame left over from
# an earlier cell execution.
mg = None
for hop in mqa_eval.hops:
    q_df = mqa_eval.test_sets[hop].copy()
    for r_set in result_set_paths:
        path = r_set.format(hop=hop)
        # Only the file read is guarded, so an error raised by the merge itself
        # can never be mistaken for a missing file.
        try:
            r_df = pd.read_csv(path)
        except FileNotFoundError:
            # No full-result file for this method/hop combination; skip it.
            continue
        # Index-on-index left join followed by dropna(): result rows whose index has
        # no counterpart in q_df get NaN Question/Answers and are removed, as is any
        # other row containing a missing value.
        mg = r_df.merge(q_df, how="left", left_index=True, right_index=True).dropna()
        # Disabled export: would write the Model column to the same path without "/full".
        # mg["Model"].to_csv(path.replace("/full", ""))
if mg is None:
    raise FileNotFoundError(
        "none of the files in result_set_paths exists for any hop in mqa_eval.hops"
    )
# Preview of the last file that was merged (last hop, last existing path).
mg.head()

Unnamed: 0,Model,Question,Answers
8,"German, English, Italian, French, German, Japa...",the movies that share actors with the movie [W...,"{german, swedish, french, english}"
14,"1963, 1990",when did the movies starred by [Spencer's Moun...,"{1942, 1966, 1955, 1952, 1947, 1998, 1981, 195..."
27,"Joseph L. Mankiewicz, Cameron Crowe, James McT...",who are the directors of films whose writers a...,"{marc webb, michael bay}"
31,"1956, 1990, 1975, 1984, 1950",what were the release dates of films starred b...,"{1966, 1955, 1953, 1994, 1957, 1972, 1958, 199..."
47,"English, Italian, German, Hebrew, Brazilian Po...",the films that share directors with the film [...,{german}


In [152]:
# Score a single result file: the first hop in mqa_eval.hops combined with the first
# path in result_set_paths that exists for that hop. The cell evaluates to the
# fraction of questions whose ROUGE-L score is at least 0.5.
hop = next(iter(mqa_eval.hops))
q_df = mqa_eval.test_sets[hop].copy()

# Reset so that a missing file can never leave a frame from an earlier run in `mg`.
mg = None
for r_set in result_set_paths:
    path = r_set.format(hop=hop)
    try:
        r_df = pd.read_csv(path)
    except FileNotFoundError:
        # This method has no full-result file for the hop; try the next path.
        continue
    # Keep only the rows that matched a test question. Rows with a missing Model
    # answer are deliberately kept: the previous blanket dropna() removed them before
    # fillna("") could run, which excluded unanswered questions from the accuracy
    # denominator.
    mg = r_df.merge(q_df, how="left", left_index=True, right_index=True).dropna(
        subset=["Question", "Answers"]
    )
    # A missing answer becomes "" and is scored like any other prediction.
    # NOTE(review): the Model values previewed by the merge cell above look
    # comma-separated; confirm that "|" is the delimiter used by the file scored here.
    mg["Model"] = mg["Model"].fillna("").map(lambda answer: set(answer.lower().split("|")))
    # ROUGE-L of the predicted answer set against the gold answer set.
    mg["rouge-l"] = mg.apply(
        lambda t: mqa_eval.get_rouge_score_for_answers(t.Answers, t.Model), axis=1
    )
    # A prediction counts as correct from a ROUGE-L score of 0.5 upwards.
    mg["Correct"] = mg["rouge-l"] >= 0.5
    break

if mg is None:
    raise FileNotFoundError(
        f"none of the files in result_set_paths exists for hop {hop!r}"
    )
# Mean of a boolean column is the share of True values, i.e. the accuracy.
mg["Correct"].mean()

0.388