In [308]:
from lxml import etree as et
from sklearn.metrics import classification_report
import numpy
import os
import pandas as pd
import json
import csv

from utils import *

DEBUG = False

In [309]:
def get_pmid(docs):
    documents = [
        document.split("/")[-1] for document in docs
    ]
    return documents

In [310]:
def get_answers(answers_files):
    all_answers = dict()
    for a in answers_files:
        with open(a, "r") as f:
            d = json.loads(f.read())
            if DEBUG:
                print(f"{len(d)} answers found in {a}")
            for key, value in d.items():
                if key in all_answers.keys():
                    print(f"MULTIPLE ANSWERS FOR {key}")
                if isinstance(value,list):
                    all_answers[key] = value[0] # get the first value which is answer not prediction for the yes/no
                else:
                    all_answers[key] = value
    return all_answers

def get_three_files(a_dir):
    return [
        a_dir + "/factoid/predictions.json",
        a_dir + "/list/predictions.json",
        a_dir + "/yesno/predictions.json",
    ]

def get_col_list(gold_df,gen_df,col):
    gold_col = gold_df.loc[:,['id',col]].copy()
    gen_col = gen_df.loc[:,['id',col]].copy() 
    
    gold = gold_col.to_dict(orient='list')
    gen = gen_col.to_dict(orient='list')
    gen_ids = gen['id']
    gen_vals = gen[col]
    gold_ids = gold['id']
    gold_vals = gold[col]
    return gold_ids,gold_vals,gen_ids,gen_vals

In [311]:

def parse_xml(xml_file, dir_for_qa): 
    no_answers = 0
    # get answers
    qa_answers = get_answers(get_three_files(dir_for_qa))
    # get ir and qu
    df_cols = ['id','human_concepts','documents','full_abstracts','titles','type', 'exact_answer']
    xtree = et.parse(xml_file)
    xroot = xtree.getroot()
    rows = []
    for question in xroot: 
        id = question.attrib.get("id")
        ir = question.find("IR")
        qp = question.find("QP")
        concepts = [e.text for e in qp.findall("Entities")]
        qa_type = qp.find("Type").text
        titles =  [e.find("Title").text for e in ir.findall("Result")]
        abstracts =  [e.find("Abstract").text for e in ir.findall("Result")]
        pmids = [e.get("PMID") for e in ir.findall("Result")]
        exact_answer = qa_answers[id] if id in qa_answers else None
        if DEBUG and not exact_answer:
            print(f"id [{id}] has no answer")
            no_answers +=1
        rows.append({"id":id,"human_concepts":concepts,"documents":pmids,"full_abstracts":abstracts,"titles":titles,"type":qa_type,'exact_answer':exact_answer})
    out_df = pd.DataFrame(rows, columns=df_cols)
    if DEBUG:
        print(f"[{no_answers}/{len(out_df)}] questions had answers")
    return out_df


In [312]:
# Set up the golden answer dataframe
golden_dataset_path = "testing_datasets/augmented_concepts_abstracts_titles.json"
generated_qu = "tmp/ir/output/bioasq_qa.xml"
with open(golden_dataset_path,'r') as f:
    gold_data = json.loads(f.read())
# load and flatten data
gold_df = pd.json_normalize(gold_data,record_path="questions")
# get gold df
gold_df['documents'] = gold_df['documents'].apply(get_pmid)
# get generated df
gen_df = parse_xml(generated_qu,'tmp/qa')

gen_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3243 entries, 0 to 3242
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              3243 non-null   object
 1   human_concepts  3243 non-null   object
 2   documents       3243 non-null   object
 3   full_abstracts  3243 non-null   object
 4   titles          3243 non-null   object
 5   type            3243 non-null   object
 6   exact_answer    1766 non-null   object
dtypes: object(7)
memory usage: 177.5+ KB


In [313]:
def print_yes_no_info(df, tag):
    print(tag)
    print(f" [{len(df)}] {tag} Yes/No Questions")
    yes_df = df[df['exact_answer'] == 'yes']
    no_df = df[df['exact_answer'] == 'no']
    print(f" [{len(yes_df)}] {tag} Yes Questions")
    print(f" [{len(no_df)}] {tag} No Questions")


""" f1 Yes
    tp is gen 'yes' | gold 'yes'
    fp is gen 'yes' | gold 'no'
    fn is gen 'no' |  gold 'yes'

    f1 No
    tp is gen 'no' | gold 'no'
    fp is gen 'no' | gold 'yes'
    fn is gen 'yes' |  gold 'no'

    IGNORE if the predicted type is yes/no but gold type is different
"""
def do_yes_no_eval(gold_df,gen_df):
    print("Yes/No Evaluation")
    yes_no_gold_df = gold_df[gold_df['type'] == 'yesno']
    yes_no_gen_df = gen_df[gen_df['type'] == 'yesno']
    
    # Gold stats
    print_yes_no_info(yes_no_gold_df, "Gold")
    # Gen Stats
    print_yes_no_info(yes_no_gen_df, "Generated")

    gold_ids,gold_ans,gen_ids,gen_ans = get_col_list(yes_no_gold_df,gen_df,'exact_answer')

    # YES 
    print(f"yes f1 eval ({len(gold_ids)}) ({len(gen_ids)})")
    ytp = 0
    yfp = 0
    yfn = 0

    for i in range (len(gold_ids)):
        gold_val = gold_ans[i]
        gen_val = gen_ans[gen_ids.index(gold_ids[i])]
        if gen_val:
            if gold_val == 'yes':
                if gen_val =='yes':
                    ytp += 1
                elif gen_val =='no':
                    yfn += 1
                else:
                    print(f"yes question [{gold_ids[i]}] had generated answer {gen_val}")
            elif gold_val == 'no':
                if gen_val == 'yes':
                    yfp +=1
                elif gen_val =='no':
                    pass #handled by no f1
                else:
                    print(f"no question [{gold_ids[i]}] had generated answer {gen_val}")
            else:
                print(f"GOLDEN answer to yes/no question [{gold_ids[i]}] was {gold_val}")
                
        else: # not identified as yes/no question by generated
            pass
    #sanity check
    print (f"ytp: {ytp}, yfp: {yfp}, yfn: {yfn}")
    try:
        yp = ytp/(ytp + yfp)
    except:
        yp = 0
    try:
        yr = ytp/(ytp + yfn)
    except:
        yr = 0
    try:
        yf1 = 2 * ((yp * yr)/(yp+yr))
    except:
        yf1 = 0
    print (f'Yes f1 {yf1}, precision {yp}, recall {yr}')

    # NO SIDE
    print("no f1 eval")
    ntp = 0
    nfp = 0
    nfn = 0

    for i in range (len(gold_ids)):
        gold_val = gold_ans[i]
        gen_val = gen_ans[gen_ids.index(gold_ids[i])]
        if gen_val:
            if gold_val == 'no':
                if gen_val =='no':
                    ntp += 1
                elif gen_val =='yes':
                    nfn += 1
                else:
                    print(f"no question [{gold_ids[i]}] had generated answer {gen_val}")
            elif gold_val == 'yes':
                if gen_val == 'no':
                    nfp +=1
                elif gen_val =='yes':
                    pass #handled by no f1
                else:
                    print(f"yes question [{gold_ids[i]}] had generated answer {gen_val}")
            else:
                print(f"GOLDEN answer to yes/no question [{gold_ids[i]}] was {gold_val}")
                
        else: # not identified as yes/no question by generated
            pass

    # sanity check
    print (f"ntp: {ntp}, nfp: {nfp}, yfn: {nfn}")
    try:
        np = ntp/(ntp + nfp)
    except:
        np = 0
    try:
        nr = ntp/(ntp + nfn)
    except:
        nr = 0
    try:
        nf1 = 2 * ((np * nr)/(np+nr))
    except:
        nf1 = 0
    print (f'No f1 {nf1}, precision {np}, recall {nr}')

    f1 = (yf1 + nf1)/2 
    print(f"Overall Yes/No f1 score {f1}")
    return yf1,yp,yr,nf1,np,nr,f1


In [314]:
# Compute Mean average precision, Geometric mean average precision, precision,recall, f1 score
def do_concepts_eval(gold_df,gen_df):

    gold_ids,gold_cons,gen_ids,gen_cons = get_col_list(gold_df,gen_df,'human_concepts')



concepts_report = do_concepts_eval(gold_df,gen_df)
print (concepts_report)

None


In [315]:
def get_f1_p_r(tp,fp,fn, tag ="calculated"):
    if DEBUG:
        print (f"{tag} tp: {tp}, fp: {fp}, fn: {fn}")
    try:
        p = tp/(tp + fp)
    except:
        p = 0
    try:
        r = tp/(tp + fn)
    except:
        r = 0
    try:
        f1 = 2 * ((p * r)/(p+r))
    except:
        f1 = 0
    if DEBUG:
        print (f'{tag} f1 {f1}, precision {p}, recall {r}')
    return f1,p,r

In [316]:
def do_pmids_eval(gold_df,gen_df):
    # pmids are the pubmed document ids
    gold_ids,gold_pmids,gen_ids,gen_pmids = get_col_list(gold_df,gen_df,'documents')
    print(f"yes f1 eval ({len(gold_ids)}) ({len(gen_ids)})")
    num_gen_q_without_docs = 0
    num_gold_q_without_docs = 0
    tp = 0
    fp = 0
    fn = 0

    scores = []
    # for each question
    for i in range (len(gold_ids)):
        gold_val = gold_pmids[i]
        if gold_val == []:
            num_gold_q_without_docs += 1
            continue
        gen_val = gen_pmids[gen_ids.index(gold_ids[i])]
        # if documents are found
        if gen_val != []:
            # TP is pmid in Gold AND Gen
            # FP is pmid NOT IN GOLD, but YES IN GEN
            # FN is pmid IN Gold but NOT GEN

            # get unique PMIDs from both gold and gen
            unique_gold_pmids = set(gold_val[0])
            unique_gen_pmids = set(gen_val[0])
            for val in unique_gold_pmids:
                if val in unique_gen_pmids:
                    tp += 1
                elif val not in unique_gen_pmids:
                    fn += 1
            for val in unique_gen_pmids:
                if val not in unique_gold_pmids:
                    fp += 1

            f1,p,r = get_f1_p_r(tp,fp,fn, tag ="PubMed Documents")
            scores.append((f1,p,r))
        else: # There are no documents retrieved for this document
            num_gen_q_without_docs += 1
            pass
    #sanity check
    print(f"[{len(gold_ids) - num_gold_q_without_docs}/{len(gold_ids)}] Questions have documents in gold dataset")
    print(f"[{len(gen_ids) - num_gen_q_without_docs}/{len(gen_ids)}] Questions have documents in generated dataset")

    # OVERALL SCORES
    f1_sum = p_sum = r_sum = 0
    for f1,p,r in scores:
        f1_sum += f1
        p_sum += p
        r_sum += r
    f1_sum /= len(scores)
    p_sum /= len(scores)
    r_sum /= len(scores)

    print (f'PubMed Documents mean f1 {f1_sum}, precision {p_sum}, recall {r_sum}')
    return f1_sum,p_sum,r_sum,scores

pmid_report = do_pmids_eval(gold_df,gen_df)

yes f1 eval (3243) (3243)
[3243/3243] Questions have documents in gold dataset
[1947/3243] Questions have documents in generated dataset
PubMed Documents mean f1 0.5822856764616713, precision 0.5808496837782734, recall 0.5837572783823382


In [329]:
# We use strict and leniant accuracy  (first result, or any result)
def do_factoid_eval(gold_df,gen_factoid_path):
    factoid_gold_df = gold_df[gold_df['type'] == 'factoid']
    factoid_gen_df = gen_df[gen_df['type'] == 'factoid']

    if DEBUG:
        print(f" [{len(factoid_gold_df)}] Gold Factoid Questions")
        print(f" [{len(factoid_gen_df)}] Generated Factoid Questions")
    gold_ids,gold_ans,gen_ids,gen_ans = get_col_list(factoid_gold_df,gen_df,'exact_answer')

    # Use alternative strategy to handle ranked factoid preds
    with open(gen_factoid_path, "r") as ft_file:
        factoid_gen_json = json.load(ft_file)
    
    gen_factoid_answers = {}
    for question in factoid_gen_json["questions"]:
        id = question["id"]
        if len(id) == 24:
            id = id[0:20]
        answer = question['exact_answer']
        if isinstance(answer,list): 
            if isinstance(answer[0],list): # handle list in list
                answer = [e[0] for e in answer]
        gen_factoid_answers[id] = answer

    num_gold_q_without_ans = 0
    num_gen_q_without_ans = 0
    num_right = 0
    # for each question
    for i in range (len(gold_ids)):
        gold_val = gold_ans[i][0]
        if gold_val == []:
            num_gold_q_without_ans += 1
            continue
        # trim last 4 digits which get removed for the final bioasq form answers
        trimmed_id = gold_ids[i][0:20]
        if trimmed_id not in gen_factoid_answers.keys():
            if DEBUG:
                print(f"no generated question for {trimmed_id}")
            num_gen_q_without_ans += 1
            continue
        gen_vals = gen_factoid_answers[trimmed_id]
        if DEBUG:
            print(gold_val," | ",gen_vals)
        # if answer is found
        if gen_vals != None:
            unique_gold_pmids = gold_val[0]
            unique_gen_pmids = gen_val[0]

    print (f"[{num_gen_q_without_ans}] questions were unable to reach QA step")
    #sanity check
    print(f"[{len(gold_ids) - num_gold_q_without_ans}/{len(gold_ids)}] Questions have answers in gold dataset")
    print(f"[{len(gen_ids) - num_gen_q_without_ans - num_empty_preds}/{len(gen_ids)}] Questions have answers in generated dataset (total - no answer - not reaching QA)")

    return
do_factoid_eval(gold_df,'tmp/qa/factoid/BioASQform_BioASQ-answer.json')

Bazex syndrome  |  ['one of the rarer cutaneous paraneoplastic syndromes. It is characterized', 's. It is characterized', 'neoplastic syndromes. It is characterized', 'tosis paraneoplastica of Bazex and report three cases which illustrate both the typical and', 'neoplastica of Bazex and report three cases which illustrate both the typical and']
castration-resistant prostate cancer  |  ['teronel with docetaxel', 'teronel with docetaxel. Recently, the ELM-PC5 Phase III clinical trial in patients with', ', a marker', 'l with docetaxel', 'well tolerated with fatigue']
plasma membrane  |  ['pore and adenosine generated by ectonucleotidase-dependent dephos', 'pore and adenosine generated by ectonucleotidase-dependent dephosphorylation of ATP', 'X', 'knowledge and hypotheses about interactions of Panx1 channels with', '-methyl-D-aspartate (NMDA) receptor channels. Activation of these receptor channels by their']
Hydrophilic Interaction Chromatography  |  [', the HILIC enantioseparations carri

In [318]:
def do_list_eval(gold_df,gen_df):
    return

In [319]:
## Fully Generated
# Concepts
# NEED TO DO
concepts_report = do_concepts_eval(gold_df,gen_df)
print(concepts_report)

# Documents 
pmids_report = do_pmids_eval(gold_df,gen_df)
print(pmids_report)

# Type
type_report = classification_report(gold_df['type'].to_numpy(),gen_df['type'].to_numpy(),output_dict=DEBUG)
print(type_report)

# Yes/No Question Answering
yes_no_report = do_yes_no_eval(gold_df,gen_df)
print(yes_no_report)

# Factoid Question Answering
factoid_report = do_factoid_eval(gold_df,'tmp/qa/factoid/BioASQform_BioASQ-answer.json')
print(factoid_report)
# List Question Answering
list_report = do_list_eval(gold_df,gen_df)
print(list_report)
## Mixed Gold

## All gold

None
yes f1 eval (3243) (3243)
[3243/3243] Questions have documents in gold dataset
[1947/3243] Questions have documents in generated dataset
PubMed Documents mean f1 0.5822856764616713, precision 0.5808496837782734, recall 0.5837572783823382
(0.5822856764616713, 0.5808496837782734, 0.5837572783823382, [(0.6, 0.6, 0.6), (0.5, 0.45454545454545453, 0.5555555555555556), (0.45161290322580644, 0.4375, 0.4666666666666667), (0.46511627906976744, 0.45454545454545453, 0.47619047619047616), (0.4727272727272727, 0.4642857142857143, 0.48148148148148145), (0.4477611940298507, 0.4411764705882353, 0.45454545454545453), (0.4675324675324675, 0.47368421052631576, 0.46153846153846156), (0.4545454545454546, 0.46511627906976744, 0.4444444444444444), (0.5294117647058824, 0.54, 0.5192307692307693), (0.5391304347826087, 0.5535714285714286, 0.5254237288135594), (0.5555555555555557, 0.5645161290322581, 0.546875), (0.5547445255474452, 0.5588235294117647, 0.5507246376811594), (0.5369127516778522, 0.54054054054054