In [229]:
from lxml import etree as et
from sklearn.metrics import classification_report
import numpy as np
import os
import pandas as pd
import json
import csv

from utils import *

DEBUG = False

In [222]:
def get_pmid(docs):
    documents = [
        document.split("/")[-1] for document in docs
    ]
    return documents

In [223]:
def get_answers(answers_files):
    all_answers = dict()
    for a in answers_files:
        with open(a, "r") as f:
            d = json.loads(f.read())
            if DEBUG:
                print(f"{len(d)} answers found in {a}")
            for key, value in d.items():
                if key in all_answers.keys():
                    print(f"MULTIPLE ANSWERS FOR {key}")
                if isinstance(value,list):
                    all_answers[key] = value[0] # get the first value which is answer not prediction for the yes/no
                else:
                    all_answers[key] = value
    return all_answers

def get_three_files(a_dir):
    return [
        a_dir + "/factoid/predictions.json",
        a_dir + "/list/predictions.json",
        a_dir + "/yesno/predictions.json",
    ]

In [224]:

def parse_xml(xml_file, dir_for_qa): 
    no_answers = 0
    # get answers
    qa_answers = get_answers(get_three_files(dir_for_qa))
    # get ir and qu
    df_cols = ['id','human_concepts','documents','full_abstracts','titles','type', 'exact_answer']
    xtree = et.parse(xml_file)
    xroot = xtree.getroot()
    rows = []
    for question in xroot: 
        id = question.attrib.get("id")
        ir = question.find("IR")
        qp = question.find("QP")
        concepts = [e.text for e in qp.findall("Entities")]
        qa_type = qp.find("Type").text
        titles =  [e.find("Title").text for e in ir.findall("Result")]
        abstracts =  [e.find("Abstract").text for e in ir.findall("Result")]
        pmids = [e.get("PMID") for e in ir.findall("Result")]
        exact_answer = qa_answers[id] if id in qa_answers else None
        if DEBUG and not exact_answer:
            print(f"id [{id}] has no answer")
            no_answers +=1
        rows.append({"id":id,"human_concepts":concepts,"documents":pmids,"full_abstracts":abstracts,"titles":titles,"type":qa_type,'exact_answer':exact_answer})
    out_df = pd.DataFrame(rows, columns=df_cols)
    if DEBUG:
        print(f"[{no_answers}/{len(out_df)}] questions had answers")
    return out_df


In [225]:
# Set up the golden answer dataframe
golden_dataset_path = "testing_datasets/augmented_concepts_abstracts_titles.json"
generated_qu = "tmp/ir/output/bioasq_qa.xml"
with open(golden_dataset_path,'r') as f:
    gold_data = json.loads(f.read())
# load and flatten data
gold_df = pd.json_normalize(gold_data,record_path="questions")
# get gold df
gold_df['documents'] = gold_df['documents'].apply(get_pmid)
# get generated df
gen_df = parse_xml(generated_qu,'tmp/qa')


In [234]:
def print_yes_no_info(df, tag):
    print(tag)
    print(f" [{len(df)}] {tag} Yes/No Questions")
    yes_df = df[df['exact_answer'] == 'yes']
    no_df = df[df['exact_answer'] == 'no']
    print(f" [{len(yes_df)}] {tag} Yes Questions")
    print(f" [{len(no_df)}] {tag} No Questions")


""" f1 Yes
    tp is yes in both
    fp is number of 'yes' in generated when gold is 'no'
    fn is number of 'no' in generated when gold is 'yes'

    f1 No
    tp is No in both
    fp is number of 'no' in generated when gold is 'yes'
    fn is number of 'yes' in generated when gold is 'no'

    IGNORE if the predicted type is yes/no but gold type is different
"""
def do_yes_no_eval(gold_df,gen_df):
    print("Yes/No Evaluation")
    yes_no_gold_df = gold_df[gold_df['type'] == 'yesno']
    yes_no_gen_df = gen_df[gen_df['type'] == 'yesno']
    # Gold stats
    print_yes_no_info(yes_no_gold_df, "Gold")
    # Gen Stats
    print_yes_no_info(yes_no_gen_df, "Generated")

    gold_answers = yes_no_gold_df.loc[:,['id','exact_answer']].copy()
    gen_answers = gen_df.loc[:,['id','exact_answer']].copy() # grabbing all because there could be incorrect type guesses
    
    # YES SIDE
    gold = gold_answers.to_dict(orient='list')
    gen = gen_answers.to_dict(orient='list')
    print(len(gold))

    # NO SIDE
    return ""

yes_no_results = do_yes_no_eval(gold_df,gen_df)
#print(yes_no_results)


# so if the gold_gen[id].exact_answer == gen[exact_answer], tp ++ 


Yes/No Evaluation
Gold
 [881] Gold Yes/No Questions
 [704] Gold Yes Questions
 [177] Gold No Questions
Generated
 [909] Generated Yes/No Questions
 [0] Generated Yes Questions
 [523] Generated No Questions
2


In [232]:

# Get classification reports
print("Report for type evaluation step")
type_report = classification_report(gold_df['type'].to_numpy(),gen_df['type'].to_numpy(),output_dict=DEBUG)
print(type_report)

# yes_no_results = do_yes_no_eval(gold_df,gen_df)

Report for type evaluation step
              precision    recall  f1-score   support

     factoid       0.93      0.84      0.88       941
        list       0.94      0.95      0.95       644
     summary       0.84      0.91      0.87       777
       yesno       0.97      1.00      0.98       881

    accuracy                           0.92      3243
   macro avg       0.92      0.92      0.92      3243
weighted avg       0.92      0.92      0.92      3243



In [228]:
yesn_no_gold_df = gold_df[gold_df['type'] == 'yesno']
factoid_gold_df = gold_df[gold_df['type'] == 'factoid']
list_gold_df = gold_df[gold_df['type'] == 'list']

yesn_no_gen_df = gen_df[gen_df['type'] == 'yesno']
factoid_gen_df = gen_df[gen_df['type'] == 'factoid']
list_gen_df = gen_df[gen_df['type'] == 'list']
