In [None]:
import pickle
import re
from typing import Any, TypedDict

import pandas as pd

In [None]:
class CacheValue(TypedDict):
    """Cached annotations for one (question, answer, narrative) tuple.

    val_annotations: list of Likert annotation values, one per respective
        source.
    annotation_sources: names of the source models the annotations refer to.

    Both lists always have the same length, but that length varies from entry
    to entry.
    """

    val_annotations: list[int]
    annotation_sources: list[str]


# The cache, keyed by a (question, answer, narrative) tuple.
# All strings in the key tuple must be lower-cased.
Cache = dict[tuple[str, str, str], CacheValue]

In [None]:
def load_cache(cache_fname: str) -> Cache:
    """Read the pickled annotation cache stored at ``cache_fname``.

    The file is opened in binary mode and deserialized with ``pickle.load``;
    the resulting object is returned as-is.

    NOTE(review): unpickling can execute arbitrary code, so only load cache
    files that come from a trusted source.
    """
    with open(cache_fname, "rb") as cache_file:
        loaded = pickle.load(cache_file)
    return loaded


def retrieve_from_cache(
    cache: Cache, question: str, answer: str, narrative: str
) -> CacheValue:
    """Look up the cache entry for a (question, answer, narrative) tuple.

    The three strings are lower-cased before the lookup, matching the
    lower-cased keys of the cache.

    Args:
        cache: mapping from lower-cased (question, answer, narrative) tuples
            to their cached annotations.
        question: question text (any casing).
        answer: answer text (any casing).
        narrative: narrative text (any casing).

    Returns:
        The cached entry, or the sentinel ``{"message": "Key not found"}`` when
        the key is absent. NOTE(review): the sentinel does not have the
        ``CacheValue`` shape, so callers must check for it explicitly.
    """
    key = (question.lower(), answer.lower(), narrative.lower())
    try:
        return cache[key]
    except KeyError:
        # Only a missing key is an expected outcome; any other error (or an
        # interrupt) must propagate instead of being reported as "not found".
        return {"message": "Key not found"}

In [None]:
# Load the human-evaluation annotation cache used by every cell below.
cache = load_cache(cache_fname="../artifacts/human_eval_cache.pkl")

In [None]:
# Flatten the cache into one record per (question, answer, narrative) key;
# the trailing expression displays how many records there are.
cache2 = [
    {
        "question": question,
        "answer": answer,
        "narrative": narrative,
        "val_anns": value["val_annotations"],
        "ann_src": value["annotation_sources"],
    }
    for (question, answer, narrative), value in cache.items()
]
len(cache2)

In [None]:
# Tabular view of the flattened cache records (one row per cached key).
df = pd.DataFrame(data=cache2)
df

In [None]:
# Count the distinct question + narrative + answer strings in the cache.
# NOTE(review): the fields are concatenated without a delimiter, so different
# triples could in principle collapse into the same string.
inputs = {
    row["question"] + row["narrative"] + row["answer"]
    for _, row in df.iterrows()
}
len(inputs)

In [None]:
from collections import Counter

# Tally annotations per source model; the sum is the total number of
# annotations across all cache entries.
c = Counter(
    source
    for value in cache.values()
    for source in value["annotation_sources"]
)
sum(c.values())

In [None]:
def remove_punctuation(text: str) -> str:
    """Drop every character that is neither a word character nor whitespace."""
    non_word_non_space = re.compile(r"[^\w\s]")
    return non_word_non_space.sub("", text)

In [None]:
def get_likert_scores(row: pd.Series, cache: Cache) -> list[int]:
    """Return the cached Likert annotations for one prediction row.

    The answer comes from ``row["predicted_answer"]`` when that field exists
    and from ``row["answer"]`` otherwise, and has its punctuation removed.
    The cache is then indexed with the lower-cased
    (question, answer, narrative) tuple; a missing key raises ``KeyError``.
    """
    answer_field = "predicted_answer" if "predicted_answer" in row else "answer"
    cleaned_answer = remove_punctuation(row[answer_field])

    key = (
        row["question"].lower(),
        cleaned_answer.lower(),
        row["narrative"].lower(),
    )
    return cache[key]["val_annotations"]

In [None]:
def overall_avg_likert(df: pd.DataFrame, cache: Cache) -> None:
    """Print the mean of the per-row average Likert scores.

    For every row of ``df`` the cached annotations are averaged; the printed
    figure is the unweighted mean of those per-row averages, rounded to two
    decimals.

    An empty ``df`` (for example an ontology subset without rows) prints a
    "no rows" notice instead of raising ``ZeroDivisionError``.
    """
    likerts: list[float] = []
    for _, row in df.iterrows():
        likertscores = get_likert_scores(row, cache)
        likerts.append(sum(likertscores) / len(likertscores))

    if not likerts:
        # Nothing to average; report it rather than dividing by zero.
        print("Overall avg Likert for all answers n/a (no rows)")
        return

    avg = sum(likerts) / len(likerts)
    print(f"Overall avg Likert for all answers {round(avg, 2)}")

In [None]:
def overall_avg_binary_likert(df: pd.DataFrame, cache: Cache) -> None:
    """Print the mean of the per-row average binarized Likert scores.

    Each annotation is mapped to 0 when it is below 1 and to 1 otherwise; the
    binarized values are averaged per row, and the printed figure is the
    unweighted mean of those per-row averages, rounded to two decimals.

    An empty ``df`` (for example an ontology subset without rows) prints a
    "no rows" notice instead of raising ``ZeroDivisionError``.
    """
    likerts: list[float] = []
    for _, row in df.iterrows():
        likertscores = get_likert_scores(row, cache)
        # Binarize: scores below 1 count as 0, everything else as 1.
        binary_likertscores = [0 if x < 1 else 1 for x in likertscores]
        likerts.append(sum(binary_likertscores) / len(binary_likertscores))

    if not likerts:
        # Nothing to average; report it rather than dividing by zero.
        print("Overall avg binary Likert for all answers: n/a (no rows)")
        return

    avg = sum(likerts) / len(likerts)
    print(f"Overall avg binary Likert for all answers: {round(avg, 2)}")

In [None]:
def get_all_numbers(df: pd.DataFrame, cache: Cache) -> None:
    """Print the overall average Likert score, then the binarized one."""
    for report in (overall_avg_likert, overall_avg_binary_likert):
        report(df, cache)

In [None]:
def evaluate_df_by_onto(df: pd.DataFrame, cache: Cache) -> None:
    """Print the Likert summaries separately for each ontology category.

    For every category, in a fixed order, the rows whose ``onto`` column
    equals that category are selected, the category name is printed, and
    ``get_all_numbers`` reports on the subset.
    """
    categories = ("Consequence", "Goal seeking", "Reactionary", "Desire", "Other")
    for category in categories:
        subset = df[df["onto"] == category]
        print(category)
        get_all_numbers(subset, cache)

In [None]:
# Ontology labels for the hidden test set questions.
ontology_df = pd.read_csv(
    filepath_or_buffer="../artifacts/hidden_test_set_ontology.csv"
)

In [None]:
# Map each question_meta value to its Ontology label; on duplicate
# question_meta values the last row wins.
meta_to_ontology_dict = dict(
    zip(ontology_df["question_meta"], ontology_df["Ontology"])
)

In [None]:
def add_onto_to_df(df: pd.DataFrame, meta_to_ontology_dict: dict[str, str]) -> pd.DataFrame:
    """Add an ``onto`` column holding each row's ontology label.

    For every row the label is looked up via ``row["question_meta"]``; when
    that lookup fails with ``KeyError`` (the column is missing or its value is
    not in the mapping) ``row["meta"]`` is used instead. A ``KeyError`` from
    the fallback lookup propagates to the caller.

    Args:
        df: prediction frame with a ``question_meta`` and/or ``meta`` column.
        meta_to_ontology_dict: mapping from meta value to ontology label.

    Returns:
        The same ``df`` object, modified in place with the new ``onto`` column.
    """
    ontos: list[str] = []
    for _, row in df.iterrows():
        try:
            ontos.append(meta_to_ontology_dict[row["question_meta"]])
        except KeyError:
            # Only a failed lookup triggers the fallback; other errors (and
            # interrupts) are no longer silently swallowed.
            ontos.append(meta_to_ontology_dict[row["meta"]])
    df["onto"] = ontos
    return df

In [None]:
# Number of rows per Ontology label in the ontology file.
onto_count_dict = (
    ontology_df.loc[:, "Ontology"]
    .value_counts()
    .to_dict()
)

In [None]:
# Display the per-ontology row counts computed from ontology_df.
onto_count_dict

In [None]:
# t5base with narrative separator: load predictions, attach ontology labels,
# and print the overall Likert averages.
t5_df = pd.read_csv("../artifacts/model_predictions/t5base_w_n_separator.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)

In [None]:
# Break the "Not Answerable" subset down by ontology category.
evaluate_df_by_onto(df=impl_t5_df, cache=cache)

In [None]:
# t5base with narrative separator and knowledge: load predictions, attach
# ontology labels, and print the overall Likert averages.
t5_knowl_df = pd.read_csv("../artifacts/model_predictions/t5base_w_n_separator_w_knowl.csv")
t5_knowl_df = t5_knowl_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_knowl_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_knowl_df = t5_knowl_df.loc[
    t5_knowl_df["is_ques_answerable"] == "Not Answerable"
]
get_all_numbers(df=impl_t5_knowl_df, cache=cache)

In [None]:
# Break the "Not Answerable" subset down by ontology category.
evaluate_df_by_onto(df=impl_t5_knowl_df, cache=cache)

In [None]:
# t511b with narrative separator: load predictions, attach ontology labels,
# and print the overall Likert averages.
t511b_df = pd.read_csv("../artifacts/model_predictions/t511b_w_n_separator.csv")
t511b_df = t511b_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t511b_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t511b_df = t511b_df.loc[t511b_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t511b_df, cache=cache)

In [None]:
# Break the "Not Answerable" subset down by ontology category.
evaluate_df_by_onto(df=impl_t511b_df, cache=cache)

In [None]:
# This file is the t511b run with the top 3 diverse COMET verbalized.
# Load predictions, attach ontology labels, and print the overall averages.
t511b_knowl_df = pd.read_csv("../artifacts/model_predictions/t511b_w_n_separator_w_knowl.csv")
t511b_knowl_df = t511b_knowl_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t511b_knowl_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t511b_knowl_df = t511b_knowl_df.loc[
    t511b_knowl_df["is_ques_answerable"] == "Not Answerable"
]
get_all_numbers(df=impl_t511b_knowl_df, cache=cache)

In [None]:
# Break the "Not Answerable" subset down by ontology category.
evaluate_df_by_onto(df=impl_t511b_knowl_df, cache=cache)

In [None]:
# gpt3: load predictions, attach ontology labels, and print the overall
# Likert averages.
gpt3_df = pd.read_csv("../artifacts/model_predictions/gpt3.csv")
gpt3_df = gpt3_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=gpt3_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_gpt3_df = gpt3_df.loc[gpt3_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_gpt3_df, cache=cache)

In [None]:
# Break the "Not Answerable" subset down by ontology category.
evaluate_df_by_onto(df=impl_gpt3_df, cache=cache)

In [None]:
# gpt3 with knowledge: load predictions, attach ontology labels, and print
# the overall Likert averages.
gpt3_knowl_df = pd.read_csv("../artifacts/model_predictions/gpt3_w_knowl.csv")
gpt3_knowl_df = gpt3_knowl_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=gpt3_knowl_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_gpt3_knowl_df = gpt3_knowl_df.loc[
    gpt3_knowl_df["is_ques_answerable"] == "Not Answerable"
]
get_all_numbers(df=impl_gpt3_knowl_df, cache=cache)

In [None]:
# Break the "Not Answerable" subset down by ontology category.
evaluate_df_by_onto(df=impl_gpt3_knowl_df, cache=cache)

# Model Setup

## Base

In [None]:
# t5base, tuple knowledge format (top 3 diverse): load, label, report.
# NOTE: rebinds t5_df, which earlier held the t5base_w_n_separator frame.
print("Gtup top3")
t5_df = pd.read_csv("../artifacts/model_predictions/t5base_tup_top3_diverse.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)

In [None]:
# t5base, tuple-with-separator knowledge format (top 3 diverse): load, label,
# report.
print("Gtupsep top3")
t5_df = pd.read_csv("../artifacts/model_predictions/t5base_tupsep_top3_diverse.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)

In [None]:
# t5base, verbalized knowledge (top 1): load, label, report.
print("Gverb. top1")
t5_df = pd.read_csv("../artifacts/model_predictions/t5base_verb_top1_diverse.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)

In [None]:
# t5base, verbalized knowledge (top 5 diverse): load, label, report.
print("Gverb. top5 diverse")
t5_df = pd.read_csv("../artifacts/model_predictions/t5base_verb_top5_diverse.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)

In [None]:
# t5base, verbalized knowledge (top 3 original): load, label, report.
print("Gverb. top3 original")
t5_df = pd.read_csv("../artifacts/model_predictions/t5base_verb_top3_original.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)

In [None]:
# t5base, verbalized knowledge (top 3 diverse): load, label, report.
print("Gverb. top3 diverse")
t5_df = pd.read_csv("../artifacts/model_predictions/t5base_verb_top3_diverse.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)

In [None]:
# t5base, verbalized knowledge (top 3 reranked): load, label, report.
print("Gverb. top3 reranked")
t5_df = pd.read_csv("../artifacts/model_predictions/t5base_verb_top3_reranked.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)

In [None]:
# t5base without separator: load, label, report.
print("T5 Appendix D.3 format - no separator")
t5_df = pd.read_csv("../artifacts/model_predictions/t5base.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)

## 11B

In [None]:
# t511b, tuple knowledge format (top 3 diverse): load, label, report.
# NOTE(review): the frame is still named t5_df although this is a t511b file.
print("Gtup top3")
t5_df = pd.read_csv("../artifacts/model_predictions/t511b_tup_top3_diverse.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)

In [None]:
# t511b, tuple-with-separator knowledge format (top 3 diverse): load, label,
# report.
# NOTE(review): the frame is still named t5_df although this is a t511b file.
print("Gtupsep top3")
t5_df = pd.read_csv("../artifacts/model_predictions/t511b_tupsep_top3_diverse.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)

In [None]:
# t511b, verbalized knowledge (top 1): load, label, report.
# NOTE(review): the frame is still named t5_df although this is a t511b file.
print("Gverb. top1")
t5_df = pd.read_csv("../artifacts/model_predictions/t511b_verb_top1_diverse.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)

In [None]:
# t511b, verbalized knowledge (top 5 diverse): load, label, report.
# NOTE(review): the frame is still named t5_df although this is a t511b file.
print("Gverb. top5 diverse")
t5_df = pd.read_csv("../artifacts/model_predictions/t511b_verb_top5_diverse.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)

In [None]:
# t511b, verbalized knowledge (top 3 original): load, label, report.
# NOTE(review): the frame is still named t5_df although this is a t511b file.
print("Gverb. top3 original")
t5_df = pd.read_csv("../artifacts/model_predictions/t511b_verb_top3_original.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)

In [None]:
# t511b, verbalized knowledge (top 3 diverse): load, label, report.
# NOTE(review): the frame is still named t5_df although this is a t511b file.
print("Gverb. top3 diverse")
t5_df = pd.read_csv("../artifacts/model_predictions/t511b_verb_top3_diverse.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)

In [None]:
# t511b, verbalized knowledge (top 3 reranked): load, label, report.
# NOTE(review): the frame is still named t5_df although this is a t511b file.
print("Gverb. top3 reranked")
t5_df = pd.read_csv("../artifacts/model_predictions/t511b_verb_top3_reranked.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)

In [None]:
# t511b without separator: load, label, report.
# NOTE(review): the frame is still named t5_df although this is a t511b file.
print("T5 Appendix D.3 format - no separator")
t5_df = pd.read_csv("../artifacts/model_predictions/t511b.csv")
t5_df = t5_df.pipe(add_onto_to_df, meta_to_ontology_dict)
get_all_numbers(df=t5_df, cache=cache)

In [None]:
# Same report restricted to rows marked "Not Answerable".
impl_t5_df = t5_df.loc[t5_df["is_ques_answerable"] == "Not Answerable"]
get_all_numbers(df=impl_t5_df, cache=cache)