In [526]:
from lxml import etree as et
from sklearn.metrics import classification_report
import numpy
import os
import pandas as pd
import json
import csv

from utils import *

DEBUG = False

In [527]:
def get_pmid(docs):
    documents = [
        document.split("/")[-1] for document in docs
    ]
    return documents

In [528]:
def get_answers(answers_files):
    all_answers = dict()
    for a in answers_files:
        with open(a, "r") as f:
            d = json.loads(f.read())
            if DEBUG:
                print(f"{len(d)} answers found in {a}")
            for key, value in d.items():
                if key in all_answers.keys():
                    print(f"MULTIPLE ANSWERS FOR {key}")
                if isinstance(value,list):
                    all_answers[key] = value[0] # get the first value which is answer not prediction for the yes/no
                else:
                    all_answers[key] = value
    return all_answers

def get_three_files(a_dir):
    return [
        a_dir + "/factoid/predictions.json",
        a_dir + "/list/predictions.json",
        a_dir + "/yesno/predictions.json",
    ]

def get_col_list(gold_df,gen_df,col):
    gold_col = gold_df.loc[:,['id',col]].copy()
    gen_col = gen_df.loc[:,['id',col]].copy() 
    
    gold = gold_col.to_dict(orient='list')
    gen = gen_col.to_dict(orient='list')
    gen_ids = gen['id']
    gen_vals = gen[col]
    gold_ids = gold['id']
    gold_vals = gold[col]
    return gold_ids,gold_vals,gen_ids,gen_vals

In [529]:

def parse_xml(xml_file, dir_for_qa): 
    no_answers = 0
    # get answers
    qa_answers = get_answers(get_three_files(dir_for_qa))
    # get ir and qu
    df_cols = ['id','human_concepts','documents','full_abstracts','titles','type', 'exact_answer']
    xtree = et.parse(xml_file)
    xroot = xtree.getroot()
    rows = []
    for question in xroot: 
        id = question.attrib.get("id")
        ir = question.find("IR")
        qp = question.find("QP")
        concepts = [e.text for e in qp.findall("Entities")]
        qa_type = qp.find("Type").text
        titles =  [e.find("Title").text for e in ir.findall("Result")]
        abstracts =  [e.find("Abstract").text for e in ir.findall("Result")]
        pmids = [e.get("PMID") for e in ir.findall("Result")]
        exact_answer = qa_answers[id] if id in qa_answers else None
        if DEBUG and not exact_answer:
            print(f"id [{id}] has no answer")
            no_answers +=1
        rows.append({"id":id,"human_concepts":concepts,"documents":pmids,"full_abstracts":abstracts,"titles":titles,"type":qa_type,'exact_answer':exact_answer})
    out_df = pd.DataFrame(rows, columns=df_cols)
    if DEBUG:
        print(f"[{no_answers}/{len(out_df)}] questions had answers")
    return out_df


In [531]:
def print_yes_no_info(df, tag):
    print(tag)
    print(f" [{len(df)}] {tag} Yes/No Questions")
    yes_df = df[df['exact_answer'] == 'yes']
    no_df = df[df['exact_answer'] == 'no']
    print(f" [{len(yes_df)}] {tag} Yes Questions")
    print(f" [{len(no_df)}] {tag} No Questions")


""" f1 Yes
    tp is gen 'yes' | gold 'yes'
    fp is gen 'yes' | gold 'no'
    fn is gen 'no' |  gold 'yes'

    f1 No
    tp is gen 'no' | gold 'no'
    fp is gen 'no' | gold 'yes'
    fn is gen 'yes' |  gold 'no'

    IGNORE if the predicted type is yes/no but gold type is different
"""
def do_yes_no_eval(gold_df,gen_df):
    print("Yes/No Evaluation")
    yes_no_gold_df = gold_df[gold_df['type'] == 'yesno']
    yes_no_gen_df = gen_df[gen_df['type'] == 'yesno']
    
    if DEBUG:
        # Gold stats
        print_yes_no_info(yes_no_gold_df, "Gold")
        # Gen Stats
        print_yes_no_info(yes_no_gen_df, "Generated")

    gold_ids,gold_ans,gen_ids,gen_ans = get_col_list(yes_no_gold_df,gen_df,'exact_answer')

    # YES 
    ytp = 0
    yfp = 0
    yfn = 0

    for i in range (len(gold_ids)):
        gold_val = gold_ans[i]
        gen_val = gen_ans[gen_ids.index(gold_ids[i])]
        if gen_val:
            if gold_val == 'yes':
                if gen_val =='yes':
                    ytp += 1
                elif gen_val =='no':
                    yfn += 1
                else:
                    if DEBUG:
                        print(f"yes question [{gold_ids[i]}] had generated answer {gen_val}")
            elif gold_val == 'no':
                if gen_val == 'yes':
                    yfp +=1
                elif gen_val =='no':
                    pass #handled by no f1
                else:
                    if DEBUG:
                        print(f"no question [{gold_ids[i]}] had generated answer {gen_val}")
            else:
                print(f"GOLDEN answer to yes/no question [{gold_ids[i]}] was {gold_val}")
                
        else: # not identified as yes/no question by generated
            pass
    #sanity check
    print (f"Yes | True Posative: {ytp}, False Posative: {yfp}, False Negative: {yfn}")
    try:
        yp = ytp/(ytp + yfp)
    except:
        yp = 0
    try:
        yr = ytp/(ytp + yfn)
    except:
        yr = 0
    try:
        yf1 = 2 * ((yp * yr)/(yp+yr))
    except:
        yf1 = 0
    print (f'Yes | f1 {yf1}, precision {yp}, recall {yr}')

    # NO SIDE
    ntp = 0
    nfp = 0
    nfn = 0

    for i in range (len(gold_ids)):
        gold_val = gold_ans[i]
        gen_val = gen_ans[gen_ids.index(gold_ids[i])]
        if gen_val:
            if gold_val == 'no':
                if gen_val =='no':
                    ntp += 1
                elif gen_val =='yes':
                    nfn += 1
                else:
                    if DEBUG:
                        print(f"no question [{gold_ids[i]}] had generated answer {gen_val}")
            elif gold_val == 'yes':
                if gen_val == 'no':
                    nfp +=1
                elif gen_val =='yes':
                    pass #handled by no f1
                else:
                    print(f"yes question [{gold_ids[i]}] had generated answer {gen_val}")
            else:
                if DEBUG:
                    print(f"GOLDEN answer to yes/no question [{gold_ids[i]}] was {gold_val}")
                
        else: # not identified as yes/no question by generated
            pass
    
    # sanity check
    print (f"No | True Posative: {ntp}, False Posative: {nfp}, False Negative: {nfn}")
    try:
        np = ntp/(ntp + nfp)
    except:
        np = 0
    try:
        nr = ntp/(ntp + nfn)
    except:
        nr = 0
    try:
        nf1 = 2 * ((np * nr)/(np+nr))
    except:
        nf1 = 0
    print (f'No | f1 {nf1}, precision {np}, recall {nr}')

    f1 = (yf1 + nf1)/2 
    p = (yp + np)/2 
    r = (yr + nr)/2 
    print(f"Overall Yes/No | f1 {f1}, precision {p}, recall {r}")
    print ('\n')
    return yf1,yp,yr,nf1,np,nr,f1

# yes_no_report = do_yes_no_eval(gold_df,gen_df)

In [532]:
# Compute [[Mean average precision, Geometric mean average precision]], precision, recall, f1 score
def do_concepts_eval(gold_df,gen_df):
    print("Concepts Evaluation")
    gold_ids,gold_cons,gen_ids,gen_cons = get_col_list(gold_df,gen_df,'human_concepts')
    num_gen_q_without_cons = 0
    num_gold_q_without_cons = 0
    tp = 0
    fp = 0
    fn = 0

    scores = []
    # for each question
    for i in range (len(gold_ids)):
        gold_val = gold_cons[i]
        if not isinstance(gold_val,list) or gold_val == []:
            num_gold_q_without_cons += 1
            continue
        gen_val = gen_cons[gen_ids.index(gold_ids[i])]
        # if concepts are found
        if gen_val != []:
            # TP is concept in Gold AND Gen
            # FP is concept NOT IN GOLD, but YES IN GEN
            # FN is concept IN Gold but NOT GEN

            # get unique concepts from both gold and gen
            unique_gold_cons = set(gold_val[0])
            unique_gen_cons = set(gen_val[0])
            for val in unique_gold_cons:
                if val in unique_gen_cons:
                    tp += 1
                elif val not in unique_gen_cons:
                    fn += 1
            for val in unique_gen_cons:
                if val not in unique_gold_cons:
                    fp += 1

            f1,p,r = get_f1_p_r(tp,fp,fn, tag ="Concepts")
            scores.append((f1,p,r))
        else: # There are no concepts retrieved for this document
            num_gen_q_without_cons += 1
            pass
    #sanity check
    print(f"[{len(gold_ids) - num_gold_q_without_cons}/{len(gold_ids)}] Questions have human readable concepts in gold dataset")
    print(f"[{len(gen_ids) - num_gen_q_without_cons}/{len(gen_ids)}] Questions have human readable concepts in generated dataset")

    # OVERALL SCORES
    f1_sum = p_sum = r_sum = 0
    for f1,p,r in scores:
        f1_sum += f1
        p_sum += p
        r_sum += r
    f1_sum /= len(scores)
    p_sum /= len(scores)
    r_sum /= len(scores)

    print (f'Concepts mean f1 {f1_sum}, precision {p_sum}, recall {r_sum}')
    print ('\n')
    return f1_sum,p_sum,r_sum,scores

# concepts_report = do_concepts_eval(gold_df,gen_df)

In [533]:
def get_f1_p_r(tp,fp,fn, tag ="calculated"):
    if DEBUG:
        print (f"{tag} tp: {tp}, fp: {fp}, fn: {fn}")
    try:
        p = tp/(tp + fp)
    except:
        p = 0
    try:
        r = tp/(tp + fn)
    except:
        r = 0
    try:
        f1 = 2 * ((p * r)/(p+r))
    except:
        f1 = 0
    if DEBUG:
        print (f'{tag} f1 {f1}, precision {p}, recall {r}')
    return f1,p,r

In [534]:
def do_pmids_eval(gold_df,gen_df):
    print("PubMed Documents Evaluation")
    # pmids are the pubmed document ids
    gold_ids,gold_pmids,gen_ids,gen_pmids = get_col_list(gold_df,gen_df,'documents')
    num_gen_q_without_docs = 0
    num_gold_q_without_docs = 0
    tp = 0
    fp = 0
    fn = 0

    scores = []
    # for each question
    for i in range (len(gold_ids)):
        gold_val = gold_pmids[i]
        if gold_val == []:
            num_gold_q_without_docs += 1
            continue
        gen_val = gen_pmids[gen_ids.index(gold_ids[i])]
        # if documents are found
        if gen_val != []:
            # TP is pmid in Gold AND Gen
            # FP is pmid NOT IN GOLD, but YES IN GEN
            # FN is pmid IN Gold but NOT GEN

            # get unique PMIDs from both gold and gen
            unique_gold_pmids = set(gold_val[0])
            unique_gen_pmids = set(gen_val[0])
            for val in unique_gold_pmids:
                if val in unique_gen_pmids:
                    tp += 1
                elif val not in unique_gen_pmids:
                    fn += 1
            for val in unique_gen_pmids:
                if val not in unique_gold_pmids:
                    fp += 1

            f1,p,r = get_f1_p_r(tp,fp,fn, tag ="PubMed Documents")
            scores.append((f1,p,r))
        else: # There are no documents retrieved for this document
            num_gen_q_without_docs += 1
            pass
    #sanity check
    print(f"[{len(gold_ids) - num_gold_q_without_docs}/{len(gold_ids)}] Questions have documents in gold dataset")
    print(f"[{len(gen_ids) - num_gen_q_without_docs}/{len(gen_ids)}] Questions have documents in generated dataset")

    # OVERALL SCORES
    f1_sum = p_sum = r_sum = 0
    for f1,p,r in scores:
        f1_sum += f1
        p_sum += p
        r_sum += r
    f1_sum /= len(scores)
    p_sum /= len(scores)
    r_sum /= len(scores)

    print (f'PubMed Documents mean f1 {f1_sum}, precision {p_sum}, recall {r_sum}')
    print ('\n')
    return f1_sum,p_sum,r_sum,scores

# pmid_report = do_pmids_eval(gold_df,gen_df)

In [535]:
# We use strict and leniant accuracy  (first result, or any result)
def do_factoid_eval(gold_df,gen_factoid_path):
    print("Factoid Evaluation")
    factoid_gold_df = gold_df[gold_df['type'] == 'factoid']
    factoid_gen_df = gen_df[gen_df['type'] == 'factoid']

    if DEBUG:
        print(f" [{len(factoid_gold_df)}] Gold Factoid Questions")
        print(f" [{len(factoid_gen_df)}] Generated Factoid Questions")
    gold_ids,gold_ans,gen_ids,gen_ans = get_col_list(factoid_gold_df,gen_df,'exact_answer')

    # Use alternative strategy to handle ranked factoid preds
    with open(gen_factoid_path, "r") as ft_file:
        factoid_gen_json = json.load(ft_file)
    
    gen_factoid_answers = {}
    for question in factoid_gen_json["questions"]:
        id = question["id"]
        if len(id) == 24:
            id = id[0:20]
        answer = question['exact_answer']
        if isinstance(answer,list): 
            if isinstance(answer[0],list): # handle list in list
                answer = [e[0] for e in answer]
        gen_factoid_answers[id] = answer

    num_gold_q_without_ans = 0
    num_strict = 0
    num_leniant = 0
    num_total = 0
    mrrs = []
    # for each question
    for i in range (len(gold_ids)):
        gold_val = gold_ans[i][0]
        if gold_val == []:
            num_gold_q_without_ans += 1
            continue
        # trim last 4 digits which get removed for the final bioasq form answers
        trimmed_id = gold_ids[i][0:20]
        if trimmed_id not in gen_factoid_answers.keys():
            if DEBUG:
                print(f"{trimmed_id} wasn't correctly identified as factoid")
            continue
        gen_vals = gen_factoid_answers[trimmed_id]
        gen_vals_clean = [e.lower().strip() for e in gen_vals]
        if DEBUG:
            print(gold_val," | ",gen_vals)
        # accuracy calculations
        gold_val_clean = gold_val
        num_total += 1
        if gold_val_clean == gen_vals_clean[0]: # force lowercase / strip whitespace to help
            num_strict += 1
            num_leniant += 1
        elif gold_val_clean in gen_vals_clean:
            num_leniant += 1

        # mrr calculations
        mrr = 0
        r = 0
        n = len(gen_vals_clean)
        for i in range(1,n+1):
            if gen_vals_clean[i-1] == gold_val_clean:
                r = i
                break
        if r != 0:
            mrr = 1/n * 1/r 
            if DEBUG:
                print(f"{trimmed_id} MRR: {mrr}")
        mrrs.append(mrr)

    average_mrr = sum(mrrs) / len(mrrs)
    leniant_acc = num_leniant/num_total
    strict_acc = num_strict/num_total

    # sanity check
    print(f"[{len(gold_ids) - num_gold_q_without_ans}/{len(gold_ids)}] Factoid questions have answers in gold dataset")
    print(f"[{num_total}/{len(gen_factoid_answers)}] Factoid questions have answers in generated dataset")
    print(f"Leniant Accuracy: {leniant_acc}, Strict Accuracy: {strict_acc}, Mean Reciprocal Rank (MRR): {average_mrr}")
    print ('\n')
    return leniant_acc,strict_acc,average_mrr

# do_factoid_eval(gold_df,'tmp/qa/factoid/BioASQform_BioASQ-answer.json')

In [536]:
def do_list_eval(gold_df,gen_df):
    print("List Evaluation")
    list_gold_df = gold_df[gold_df['type'] == 'list']
    list_gen_df = gen_df[gen_df['type'] == 'list']

    if DEBUG:
        print(f" [{len(list_gold_df)}] Gold List Questions")
        print(f" [{len(list_gen_df)}] Generated List Questions")
    
    gold_ids,gold_ans,gen_ids,gen_ans = get_col_list(list_gold_df,gen_df,'exact_answer')
    num_gen_q_without_ans = 0
    num_gold_q_without_ans = 0
    tp = 0
    fp = 0
    fn = 0

    num_gen = 0
    scores = []
    # for each question
    for i in range (len(gold_ids)):
        gold_val = gold_ans[i]
        if gold_val == []:
            num_gold_q_without_ans += 1
            continue
        gen_val = gen_ans[gen_ids.index(gold_ids[i])]
        # if answers are found
        if gen_val != None: # List is only able to find a single item list
            # TP is answer in Gold AND Gen
            # FP is answer NOT IN GOLD, but YES IN GEN
            # FN is answer IN Gold but NOT GEN
            gold_list = gold_val[0]
            for val in gold_list:
                if val in gen_val:
                    tp += 1
                elif val not in gen_val:
                    fn += 1
            for val in gen_val:
                if val not in gold_list:
                    fp += 1
            num_gen += 1
            f1,p,r = get_f1_p_r(tp,fp,fn, tag ="List Questions")
            scores.append((f1,p,r))
    #sanity check
    print(f"[{len(gold_ids) - num_gold_q_without_ans}/{len(gold_ids)}] List questions have answers in gold dataset")
    print(f"[{num_gen}/{len(list_gen_df)}] List questions have answers in generated dataset")

    # OVERALL SCORES
    f1_sum = p_sum = r_sum = 0
    for f1,p,r in scores:
        f1_sum += f1
        p_sum += p
        r_sum += r
    f1_sum /= len(scores)
    p_sum /= len(scores)
    r_sum /= len(scores)

    print (f'List Questions mean f1 {f1_sum}, precision {p_sum}, recall {r_sum}')
    print ('\n')
    return f1_sum,p_sum,r_sum,scores

list_report = do_list_eval(gold_df,gen_df)


List Evaluation
[644/644] List questions have answers in gold dataset
[395/647] List questions have answers in generated dataset
List Questions mean f1 0.0007866310930120968, precision 0.00040072530991740955, recall 0.021403064121746397




In [538]:
# Set up the golden answer dataframe
golden_dataset_path = "testing_datasets/augmented_concepts_abstracts_titles.json"
generated_qu = "tmp/ir/output/bioasq_qa.xml"
with open(golden_dataset_path,'r') as f:
    gold_data = json.loads(f.read())
# load and flatten data
gold_df = pd.json_normalize(gold_data,record_path="questions")
# get gold df
gold_df['documents'] = gold_df['documents'].apply(get_pmid)
# get generated df

gen_df = parse_xml(generated_qu,'tmp/qa')

## Fully Generated
# Concepts
concepts_report = do_concepts_eval(gold_df,gen_df)
#print(concepts_report)

# Documents 
pmids_report = do_pmids_eval(gold_df,gen_df)
# print(pmids_report)

# Type
type_report = classification_report(gold_df['type'].to_numpy(),gen_df['type'].to_numpy(),output_dict=DEBUG)
# print(type_report)

# Yes/No Question Answering
yes_no_report = do_yes_no_eval(gold_df,gen_df)
#print(yes_no_report)

# Factoid Question Answering
factoid_report = do_factoid_eval(gold_df,'tmp/qa/factoid/BioASQform_BioASQ-answer.json')
#print(factoid_report)

# List Question Answering
list_report = do_list_eval(gold_df,gen_df)
# print(list_report)
## Mixed Gold

## All gold

Concepts Evaluation
[1607/3243] Questions have human readable concepts in gold dataset
[3242/3243] Questions have human readable concepts in generated dataset
Concepts mean f1 0.507613033876486, precision 0.6096832682725072, recall 0.43495543509698553


PubMed Documents Evaluation
[3243/3243] Questions have documents in gold dataset
[1947/3243] Questions have documents in generated dataset
PubMed Documents mean f1 0.5822856764616713, precision 0.5808496837782734, recall 0.5837572783823382


Yes/No Evaluation
Yes | True Posative: 0, False Posative: 0, False Negative: 427
Yes | f1 0, precision 0, recall 0.0
No | True Posative: 78, False Posative: 427, False Negative: 0
No | f1 0.26758147512864494, precision 0.15445544554455445, recall 1.0
Overall Yes/No | f1 0.13379073756432247, precision 0.07722772277227723, recall 0.5


Factoid Evaluation
[941/941] Factoid questions have answers in gold dataset
[460/493] Factoid questions have answers in generated dataset
Leniant Accuracy: 0.0, Strict 