In [None]:
import pandas as pd
import os
import json
from glob import glob
from tqdm import tqdm

# Index every QA record by its "ID" field so result items can be joined to
# the metadata fields read below (doc_type, doc_name, evidence_source, answer_form).
with open("../data/qas_v2.json", encoding="utf-8") as f:
    qa_dict = {item["ID"]: item for item in json.load(f)}

paths = glob("../output/**/*.json", recursive=True)
ocr_types = ["gt"]

# One list of per-item records for each stage directory found in the path.
ret_df = []
gen_df = []
end_df = []
for path in tqdm(paths):
    # Result files whose path mentions these two names are excluded.
    if "gpt-4o" in path or "qwen2_72b" in path:
        continue
    basename = os.path.basename(path).removesuffix(".json")
    # The file's parent directory name is treated as the OCR type.
    ocr_type = os.path.basename(os.path.dirname(path))
    if ocr_type not in ocr_types:
        continue

    # Match the stage markers against a "/"-separated path so they are also
    # found when glob returns OS-specific separators.
    stage_path = path.replace(os.sep, "/")
    name_parts = basename.split("_")
    if "/retrieval/" in stage_path:
        ret = name_parts[1]
        llm = ""
        records = ret_df
    elif "/generation/" in stage_path:
        ret = ""
        llm = "_".join(name_parts[-2:])
        records = gen_df
    elif "/end2end/" in stage_path:
        ret = name_parts[1]
        llm = "_".join(name_parts[-2:])
        records = end_df
    else:
        # Not under a known stage directory: skip it instead of silently
        # reusing the target list / ret / llm left over from the previous file.
        continue

    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    for item in data["results"]:
        # Items without a matching QA record are dropped.
        if item["id"] not in qa_dict:
            continue
        qa = qa_dict[item["id"]]
        records.append({
            "id": item["id"],
            "ocr_type": ocr_type,
            "ret": ret,
            "llm": llm,
            "domain": qa["doc_type"],
            "doc_name": qa["doc_name"].split("/")[-1],
            "evidence_source": qa["evidence_source"],
            "answer_form": qa["answer_form"],
            # Metric columns are merged last, so they win on any key clash.
            **item["metrics"],
        })

In [2]:
# Turn each stage's accumulated records into a DataFrame, rebinding the
# same three names (the right-hand tuple is built before any rebinding).
end_df, gen_df, ret_df = (
    pd.DataFrame(stage_records) for stage_records in (end_df, gen_df, ret_df)
)

# Gen Metrics

In [3]:
import pandas as pd
# Lift pandas' display limits so frames are rendered without truncating
# rows or columns.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, None)

# Descriptor (non-metric) columns present on every result record.
keys = [
    "ocr_type",
    "ret",
    "llm",
    "domain",
    "doc_name",
    "evidence_source",
    "answer_form",
]

In [4]:
from copy import deepcopy

def show_types(input_df, ocr_types, metric="F1", domain=False):
    """Summarise a metric per OCR type and per evidence source (or domain).

    Rows whose ``ocr_type`` is not in ``ocr_types`` are ignored. The metric is
    multiplied by 100 and then aggregated per group, plus one extra "all" row
    per OCR type covering every selected row of that OCR type.

    Args:
        input_df: DataFrame with at least the columns ``ocr_type``, the
            grouping column (``evidence_source`` or ``domain``) and ``metric``.
            It is not modified.
        ocr_types: Iterable of ``ocr_type`` values to keep.
        metric: Name of the numeric column to aggregate.
        domain: If true, group by ``domain``; otherwise by ``evidence_source``.

    Returns:
        DataFrame indexed by ``(ocr_type, <grouping column>)`` with columns
        ``mean_metric`` (mean of metric * 100) and ``count`` (non-null metric
        values). Within each OCR type, known evidence sources come first in a
        fixed order, any other labels (including all domains) follow
        alphabetically, and the "all" row is always last.

    Raises:
        KeyError: If a required column is missing from ``input_df``.
    """
    group_col = "domain" if domain else "evidence_source"

    evidence_order = {'text': 0, 'table': 1, 'formula': 2, 'chart': 3, 'reading_order': 4, 'multi': 5, 'all': 6}
    all_rank = evidence_order['all']
    # The evidence ordering only applies to evidence sources; when grouping by
    # domain, only the synthetic "all" row gets a fixed position.
    known_order = {'all': all_rank} if domain else evidence_order

    # Work on a filtered slice with just the needed columns; assign() returns a
    # new frame, so the caller's data is never touched.
    keep_rows = input_df["ocr_type"].isin(ocr_types)
    selected = input_df.loc[keep_rows, ["ocr_type", group_col, metric]]
    scaled = selected.assign(**{metric: selected[metric] * 100})

    stats = {"mean_metric": (metric, 'mean'), "count": (metric, 'count')}
    per_group = scaled.groupby(["ocr_type", group_col]).agg(**stats).reset_index()
    overall = scaled.groupby("ocr_type").agg(**stats).reset_index()
    overall[group_col] = "all"

    combined = pd.concat([per_group, overall], ignore_index=True)
    # Labels without a fixed rank are placed just before "all"; the grouping
    # column is the final sort key so their relative order is alphabetical.
    combined["_order"] = combined[group_col].map(known_order).fillna(all_rank - 0.5)
    combined = combined.sort_values(by=["ocr_type", "_order", group_col])

    return combined.drop(columns=["_order"]).set_index(["ocr_type", group_col])

In [None]:
# Retrieval results: mean "lcs" (x100, rounded to 1 decimal) per evidence source,
# one column per OCR type.
ret_summary = show_types(ret_df, ocr_types, "lcs", domain=False).round(1)
ret_table = ret_summary.pivot_table(
    index='evidence_source', columns='ocr_type', values='mean_metric', aggfunc=lambda x: x
)
# pivot_table sorts its index alphabetically; restore the row order that
# show_types produced, keeping only the rows the pivot actually contains.
ret_row_order = ret_summary.index.get_level_values('evidence_source').unique()
display(ret_table.loc[[label for label in ret_row_order if label in ret_table.index]])

In [None]:
# Generation results: mean of the default metric (x100, rounded to 1 decimal)
# per evidence source, one column per OCR type.
gen_summary = show_types(gen_df, ocr_types, domain=False).round(1)
gen_table = gen_summary.pivot_table(
    index='evidence_source', columns='ocr_type', values='mean_metric', aggfunc=lambda x: x
)
# pivot_table sorts its index alphabetically; restore the row order that
# show_types produced, keeping only the rows the pivot actually contains.
gen_row_order = gen_summary.index.get_level_values('evidence_source').unique()
display(gen_table.loc[[label for label in gen_row_order if label in gen_table.index]])

In [None]:
# End-to-end results: mean of the default metric (x100, rounded to 1 decimal)
# per evidence source, one column per OCR type.
end_summary = show_types(end_df, ocr_types, domain=False).round(1)
end_table = end_summary.pivot_table(
    index='evidence_source', columns='ocr_type', values='mean_metric', aggfunc=lambda x: x
)
# pivot_table sorts its index alphabetically; restore the row order that
# show_types produced, keeping only the rows the pivot actually contains.
end_row_order = end_summary.index.get_level_values('evidence_source').unique()
display(end_table.loc[[label for label in end_row_order if label in end_table.index]])