In [53]:
from zeno_client import ZenoClient, ZenoMetric
import pandas as pd
import json
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment.
# NOTE(review): override=True should make .env values take precedence over
# variables that are already set -- confirm against the python-dotenv docs.
load_dotenv(override=True)

False

In [54]:
# Root of the experiment data. Defaults to the original location; set
# DBQA_ROOT_DIR to run the notebook against a different checkout.
root_dir = os.environ.get('DBQA_ROOT_DIR', '/data/user_data/jhsia2/dbqa')
results_dir = os.path.join(root_dir, 'reader_results')

# The Zeno API key is a credential and must not be committed with the notebook.
# It is read from the environment (which may be populated from a .env file).
zeno_api_key = os.environ.get('ZENO_API_KEY')
if not zeno_api_key:
    raise RuntimeError(
        "ZENO_API_KEY is not set; export it or add it to a .env file before running this notebook."
    )
client = ZenoClient(zeno_api_key)


In [55]:
# Read the gold examples used for the Zeno dataset and order them by id.
gold_zeno_path = os.path.join(root_dir, 'data', "gold_nq_zeno_file.json")
with open(gold_zeno_path, "r") as gold_file:
    gold_data = json.load(gold_file)
gold_data = sorted(gold_data, key=lambda example: example["id"])

In [56]:
def combine_gold_and_compiled(output_data, gold_data):
    """Attach gold answers and gold provenance to each reader result, matched by ``id``.

    Args:
        output_data: list of reader-result dicts; each must carry an ``id``.
            The dicts are updated in place.
        gold_data: list of gold dicts with an ``id`` and an ``output`` dict
            holding ``answer_set``, ``wiki_id_set`` and ``wiki_par_id_set``.

    Returns:
        The same ``output_data`` list, each entry now carrying
        ``gold_answer_set`` and ``gold_context``.

    Raises:
        KeyError: if a result's ``id`` has no counterpart in ``gold_data``.
    """
    # Join on id instead of list position: positional pairing silently attaches
    # the wrong gold entry (or drops trailing entries) whenever the two lists
    # differ in order or length.
    gold_by_id = {gd['id']: gd['output'] for gd in gold_data}
    for od in output_data:
        result_id = od['id']
        if result_id not in gold_by_id:
            raise KeyError(f"no gold entry found for result id {result_id!r}")
        gold_output = gold_by_id[result_id]
        od['gold_answer_set'] = gold_output['answer_set']
        od['gold_context'] = {
            'wiki_id_set': gold_output['wiki_id_set'],
            'wiki_par_id_set': gold_output['wiki_par_id_set'],
        }
    return output_data

In [65]:
# Layout of a single retrieved passage inside the "retrieved context" list.
retrieved_passage_view = {
    "type": "vstack",
    "keys": {
        "score": {"type": "text", "label": "score: "},
        "wiki_id": {"type": "markdown"},
        "text": {"type": "text", "label": "text: "},
        "wiki_id_match": {"type": "text", "label": "wiki_id match: "},
        "wiki_par_id_match": {"type": "text", "label": "wiki_par_id match: "},
    },
}

# Layout of one system output: gold answers, reader answer, then the passages.
system_output_view = {
    "type": "vstack",
    "keys": {
        "gold answer set": {"type": "text", "label": "gold answer set: "},
        "answer": {"type": "text", "label": "reader answer: "},
        "retrieved context": {
            "type": "list",
            "elements": retrieved_passage_view,
            "collapsible": "bottom",
            "border": True,
            "pad": True,
        },
    },
}

# (metric name shown in Zeno, dataframe column it averages), in display order.
mean_metric_columns = [
    ("max retrieved score", "max_score"),
    ("avg retrieved score", "avg_score"),
    ("exact_match", "exact_match"),
    ("f1", "f1"),
    ("substring_match", "substring_match"),
    ("any wiki_id_match", "any wiki_id_match"),
    ("any wiki_par_id_match", "any wiki_par_id_match"),
    ("avg wiki_id_match", "avg wiki_id_match"),
    ("avg wiki_par_id_match", "avg wiki_par_id_match"),
]

project = client.create_project(
    name="Document QA",
    view={
        "data": {"type": "text", "label": "question:"},
        "label": {"type": "text"},
        "output": system_output_view,
    },
    description="Document-grounded question answering with Wikipedia",
    metrics=[
        ZenoMetric(name=metric_name, type="mean", columns=[column])
        for metric_name, column in mean_metric_columns
    ],
)

Successfully updated project.
Access your project at  https://hub.zenoml.com/project/jhsia2/Document%20QA


In [58]:
# One dataset row per gold example: the question text plus its id.
question_column = [example["input"] for example in gold_data]
id_column_values = [example['id'] for example in gold_data]
data_df = pd.DataFrame({"question": question_column, 'id': id_column_values})
project.upload_dataset(data_df, id_column="id", data_column="question")

  0%|          | 0/1 [00:00<?, ?it/s]

Successfully uploaded data


In [59]:
# Replace gold_data with the KILT-format gold file, again ordered by id.
# Cells that run after this one see this version of gold_data.
gold_kilt_path = os.path.join(root_dir, 'data', "gold-nq-dev-kilt.json")
with open(gold_kilt_path, "r") as gold_file:
    gold_data = json.load(gold_file)
gold_data = sorted(gold_data, key=lambda example: example["id"])

In [67]:
reader_model = 'flanT5'
retriever_model = 'bm25'
dataset = 'nq'
top_ks = [f"top{k}" for k in (1, 2, 3, 5, 10, 20, 30, 50)]

# Folder that holds one result sub-directory per top-k setting.
experiment_dir = os.path.join(results_dir, reader_model, dataset, retriever_model)

for top_k in top_ks:
    print(top_k)
    results_path = os.path.join(experiment_dir, f"exp2/{top_k}/reader_results_zeno.json")
    with open(results_path, "r") as results_file:
        data = json.load(results_file)

    combined_data = combine_gold_and_compiled(data, gold_data)
    output_df = get_reader_df(combined_data)

    # Each top-k setting is uploaded as its own Zeno system.
    system_name = ' '.join([dataset, retriever_model, reader_model, top_k])
    project.upload_system(
        output_df, name=system_name, id_column="id", output_column="output"
    )

top1


  0%|          | 0/3 [00:00<?, ?it/s]

Successfully uploaded system
top2


  0%|          | 0/4 [00:00<?, ?it/s]

Successfully uploaded system
top3


  0%|          | 0/6 [00:00<?, ?it/s]

Successfully uploaded system
top5


  0%|          | 0/10 [00:00<?, ?it/s]

Successfully uploaded system
top10


  0%|          | 0/17 [00:00<?, ?it/s]

Successfully uploaded system
top20


  0%|          | 0/30 [00:00<?, ?it/s]

Successfully uploaded system
top30


  0%|          | 0/52 [00:00<?, ?it/s]

Successfully uploaded system
top50


  0%|          | 0/82 [00:00<?, ?it/s]

Successfully uploaded system


In [64]:
reader_model = 'flanT5'
retriever_model = 'bm25'
dataset = 'nq'
top_k = 'baseline'
print(top_k)

# The baseline results live next to the top-k folders of the same experiment.
baseline_path = os.path.join(
    results_dir, reader_model, dataset, retriever_model,
    f"exp2/{top_k}/reader_results_zeno.json",
)
with open(baseline_path, "r") as results_file:
    data = json.load(results_file)

combined_data = combine_gold_and_compiled(data, gold_data)
output_df = get_baseline_df(combined_data)

# The system name deliberately follows the original: no retriever component.
system_name = ' '.join([dataset, reader_model, top_k])
project.upload_system(
    output_df, name=system_name, id_column="id", output_column="output"
)

baseline


  0%|          | 0/2 [00:00<?, ?it/s]

Successfully uploaded system


In [60]:
def get_baseline_df(combined_data):
    """Build the Zeno system table for a reader run that has no retrieved passages.

    Args:
        combined_data: list of result dicts. Each entry is read for ``input``,
            ``id``, ``gold_answer_set``, ``gold_context``, ``output["answer"]``
            and ``output["answer_evaluation"]`` (``f1``, ``exact_match``,
            ``substring_match``).

    Returns:
        pandas.DataFrame with one row per entry. ``output`` holds a JSON string
        with the gold answers, gold context, reader answer and a single all-None
        placeholder passage. Every retrieval-related column (``max_score``,
        ``avg_score`` and the ``any``/``avg`` match columns) is filled with None.
    """
    n_rows = len(combined_data)

    def answer_metric(name):
        # Per-example answer-quality value stored under output.answer_evaluation.
        return [d["output"]["answer_evaluation"][name] for d in combined_data]

    def no_values():
        # Retrieval metrics do not exist for this run; keep the column, leave it empty.
        return [None] * n_rows

    # Placeholder passage so the "retrieved context" list keeps one element.
    # NOTE(review): get_reader_df emits a "wiki_id" key for each passage while
    # this placeholder uses "wiki_par_id" -- confirm which key is intended.
    placeholder_passage = {
        "wiki_par_id": None,
        "text": None,
        "score": None,
        "wiki_id_match": None,
        "wiki_par_id_match": None,
    }

    return pd.DataFrame(
        {
            "question": [d['input'] for d in combined_data],
            "id": [d['id'] for d in combined_data],
            "output": [
                json.dumps(
                    {
                        "gold answer set": ', '.join(d['gold_answer_set']),
                        "gold context": d['gold_context'],
                        "answer": d["output"]["answer"],
                        "retrieved context": [placeholder_passage],
                    }
                )
                for d in combined_data
            ],
            "max_score": no_values(),
            "avg_score": no_values(),
            "f1": answer_metric("f1"),
            "exact_match": answer_metric("exact_match"),
            "substring_match": answer_metric("substring_match"),
            "any wiki_id_match": no_values(),
            "any wiki_par_id_match": no_values(),
            "avg wiki_id_match": no_values(),
            "avg wiki_par_id_match": no_values(),
        }
    )
# output_df["id"] = 

In [66]:
import numpy as np
def get_reader_df(combined_data):
    """Build the Zeno system table for a reader run that used retrieved passages.

    Args:
        combined_data: list of result dicts. Each entry is read for ``input``,
            ``id``, ``gold_answer_set``, ``gold_context`` and, under ``output``:
            ``answer``, ``retrieved`` (list of passages with ``wiki_id``,
            ``text``, ``score``, ``wiki_id_match``, ``wiki_par_id_match``),
            ``answer_evaluation`` and ``summary context evaluation``.

    Returns:
        pandas.DataFrame with one row per entry: the question, its id, a JSON
        ``output`` string for display, and the per-example metric columns.
    """

    def passage_entry(passage):
        # The wiki id is rendered as a markdown link to the Wikipedia page.
        page_id = passage["wiki_id"]
        return {
            "wiki_id": "[{idx}]({url})".format(
                idx=page_id,
                url="https://en.wikipedia.org/?curid=" + page_id,
            ),
            "text": passage["text"],
            "score": passage["score"],
            "wiki_id_match": passage["wiki_id_match"],
            "wiki_par_id_match": passage["wiki_par_id_match"],
        }

    def display_payload(example):
        # JSON blob shown per example: gold info, reader answer, passages.
        return json.dumps(
            {
                "gold answer set": ', '.join(example['gold_answer_set']),
                "gold context": example['gold_context'],
                "answer": example["output"]["answer"],
                "retrieved context": [
                    passage_entry(passage)
                    for passage in example["output"]["retrieved"]
                ],
            }
        )

    def answer_metric(metric):
        return [ex["output"]["answer_evaluation"][metric] for ex in combined_data]

    def passage_mean(field):
        # Mean of one passage field over each example's retrieved list.
        return [
            np.mean([passage[field] for passage in ex["output"]["retrieved"]])
            for ex in combined_data
        ]

    def context_summary(field):
        return [
            ex["output"]["summary context evaluation"][field] for ex in combined_data
        ]

    columns = {
        "question": [ex['input'] for ex in combined_data],
        "id": [ex['id'] for ex in combined_data],
        "output": [display_payload(ex) for ex in combined_data],
        # Score of the first retrieved passage.
        # NOTE(review): presumably passages are ranked best-first, making this
        # the maximum -- confirm against the retriever output.
        "max_score": [ex["output"]["retrieved"][0]["score"] for ex in combined_data],
        "avg_score": passage_mean("score"),
        "f1": answer_metric("f1"),
        "exact_match": answer_metric("exact_match"),
        "substring_match": answer_metric("substring_match"),
        "avg wiki_id_match": passage_mean("wiki_id_match"),
        "avg wiki_par_id_match": passage_mean("wiki_par_id_match"),
        "any wiki_id_match": context_summary("wiki_id_match"),
        "any wiki_par_id_match": context_summary("wiki_par_id_match"),
    }
    return pd.DataFrame(columns)
# output_df["id"] = 

[]