# Results of Manual Evaluation of Relationship Predictions

In [1]:
import pandas as pd
import numpy as np

All evaluations keyed by evaluator:

In [2]:
EVALS = {"cjm": ["uberon"], "st": ["hp", "mp"]}

Load the results of the manual evaluation of relationship predictions.

Note each TSV only has Eval filled for the ontologies evaluated by the respective evaluator,
so we retain only those rows.

We also keep a representative `last_df` for ad-hoc calculation of summary statistics from pre-set
values.

In [3]:
dfs = []
last_df = None
for k, onts in EVALS.items():
    df = pd.read_csv(f"results/manual-relation-eval/rel-eval.{k}.tsv", sep="\t")
    last_df = df.copy()
    df = df[df["ontology"].isin(onts)]
    dfs.append(df)

In [4]:
last_df.fillna("").groupby(["ontology", "outcome", "qualifier"]).agg({"outcome": "count"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,outcome
ontology,outcome,qualifier,Unnamed: 3_level_1
cl,false_negative,,53
cl,false_negative,prediction_is_more_general,12
cl,false_positive,,57
cl,true_positive,,40
envo,false_negative,,20
envo,false_negative,prediction_is_more_general,3
envo,false_positive,,18
envo,true_positive,,39
foodon,false_negative,,19
foodon,false_negative,prediction_is_more_general,8


In [5]:
last_df[last_df["outcome"] == "false_negative"].size

7455

In [6]:
last_df[last_df["qualifier"] == "prediction_is_more_general"].size


1365

In [7]:
last_df[last_df["qualifier"] == "prediction_is_more_general"].size / last_df[last_df["outcome"] == "false_negative"].size

0.18309859154929578

In [8]:
df = pd.concat(dfs)

In [9]:
df

Unnamed: 0,ontology,term_id,term_label,pred,outcome,qualifier,expected_tgt,predicted_tgt,review,notes,predicted_tgt_exists,Eval,"If new term, is it valid?",Issue URL,Notes
417,uberon,UBERON:0006562,pharynx,OnlyInTaxon,false_negative,,Bilateria,,,TC,True,,,,
418,uberon,UBERON:0006562,pharynx,subClassOf,false_positive,,,OrganismPart,,,False,TP,Yes,,
419,uberon,UBERON:0006562,pharynx,subClassOf,false_negative,,SubdivisionOfDigestiveTract,,,,True,,,,
420,uberon,UBERON:8480060,paraspinal region,OnlyInTaxon,false_negative,,Vertebrata_vertebrates_,,,,True,,,,
421,uberon,UBERON:8480060,paraspinal region,subClassOf,false_negative,,PosteriorRegionOfBody,,,,True,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,mp,MP:0031474,increased female germ cell apoptosis,subClassOf,true_positive,,AbnormalFemaleGermCellApoptosis,AbnormalFemaleGermCellApoptosis,OK,,True,,,,
618,mp,MP:0014278,semilobar holoprosencephaly,subClassOf,true_positive,,Holoprosencephaly,Holoprosencephaly,OK,,True,,,,
619,mp,MP:0014246,decreased cellular ATP level,subClassOf,true_positive,,AbnormalCellularATPLevel,AbnormalCellularATPLevel,OK,,True,,,,
620,mp,MP:0014246,decreased cellular ATP level,subClassOf,false_positive,,,DecreasedATPLevel,correct,This term is not in the 2023-08-09/mo.owl onto...,False,,,,


In [10]:
OUTCOME_NORM = {
    "TP": "true_positive",
    "FP": "false_positive",
    "FN": "false_negative",
    "TN": "true_negative",
    "TP*": "true_positive",
    "ENTAILED": "true_positive",
}
df["normalized_eval_outcome"] = df["Eval"].map(OUTCOME_NORM)

In [11]:
df["normalized_eval_outcome"] = df.apply(lambda row: OUTCOME_NORM.get(row['Eval'], row['outcome']), axis=1)
df

Unnamed: 0,ontology,term_id,term_label,pred,outcome,qualifier,expected_tgt,predicted_tgt,review,notes,predicted_tgt_exists,Eval,"If new term, is it valid?",Issue URL,Notes,normalized_eval_outcome
417,uberon,UBERON:0006562,pharynx,OnlyInTaxon,false_negative,,Bilateria,,,TC,True,,,,,false_negative
418,uberon,UBERON:0006562,pharynx,subClassOf,false_positive,,,OrganismPart,,,False,TP,Yes,,,true_positive
419,uberon,UBERON:0006562,pharynx,subClassOf,false_negative,,SubdivisionOfDigestiveTract,,,,True,,,,,false_negative
420,uberon,UBERON:8480060,paraspinal region,OnlyInTaxon,false_negative,,Vertebrata_vertebrates_,,,,True,,,,,false_negative
421,uberon,UBERON:8480060,paraspinal region,subClassOf,false_negative,,PosteriorRegionOfBody,,,,True,,,,,false_negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,mp,MP:0031474,increased female germ cell apoptosis,subClassOf,true_positive,,AbnormalFemaleGermCellApoptosis,AbnormalFemaleGermCellApoptosis,OK,,True,,,,,true_positive
618,mp,MP:0014278,semilobar holoprosencephaly,subClassOf,true_positive,,Holoprosencephaly,Holoprosencephaly,OK,,True,,,,,true_positive
619,mp,MP:0014246,decreased cellular ATP level,subClassOf,true_positive,,AbnormalCellularATPLevel,AbnormalCellularATPLevel,OK,,True,,,,,true_positive
620,mp,MP:0014246,decreased cellular ATP level,subClassOf,false_positive,,,DecreasedATPLevel,correct,This term is not in the 2023-08-09/mo.owl onto...,False,,,,,false_positive


In [12]:
df.to_csv("all_outcomes.csv")

In [13]:
def himax(tbl):
    """
    Prepare a table for publication, highlighting maximum values
    """
    tbl_subset = tbl.select_dtypes(include=[np.number]).columns
    return tbl.style.hide().highlight_max(subset=tbl_subset, axis=0, props='font-weight:bold').format(precision=3)

In [14]:



def compute_metrics_new(input_df: pd.DataFrame, ignore_ungrounded=True, use_partial=True, preds=None, group_by=None, partial_fn_factor=None) -> pd.DataFrame:
    """
    Compute precision, recall, and F1 score grouped by model_name, method, and ontology.
    
    Parameters:
    - df: DataFrame containing the input data with columns 'model_name', 'method', 'ontology', 'tp', 'fp', 'fn'
    
    Returns:
    - DataFrame with aggregated results.
    """
    if ignore_ungrounded:
        input_df = input_df.query("predicted_tgt_exists == True")
    if use_partial:
        # set outcome to 'partial_false_negative' if (1)
        # outcome is 'false_negative' and (2) the qualifier is 'partial'
        input_df.loc[(input_df['outcome'] == 'false_negative') & (input_df['qualifier'] == 'prediction_is_more_general'), 'outcome'] = 'partial_false_negative'
    # filter by pred
    if preds is not None:
        input_df = input_df[input_df['pred'].isin(preds)]

    def calculate_metrics_for_group(group):
        tp = len(group[group['outcome'] == 'true_positive'])
        fp = len(group[group['outcome'] == 'false_positive'])
        fn = len(group[group['outcome'] == 'false_negative'])
        tn = len(group[group['outcome'] == 'true_negative'])
        
        if partial_fn_factor is None:
            partials = group[group['outcome'] == 'partial_false_negative']
            sum_ic_ratio = partials['ic_ratio'].sum()
            # Calculate the sum of all (1 - 'ic_ratio') values
            sum_inverse_ic_ratio = (1 - partials['ic_ratio']).sum()
            tp += sum_ic_ratio
            fn += sum_inverse_ic_ratio
        else:
            tp += len(group[group['outcome'] == 'partial_false_negative']) * (1-partial_fn_factor)
            fn += len(group[group['outcome'] == 'partial_false_negative']) * partial_fn_factor
        
    
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    
        return pd.Series({'precision': precision, 'recall': recall, 'f1': f1_score})
    
    if group_by is None:
        group_by = ['ontology']
    # Group by 'model_name', 'method', and 'ontology' and aggregate tp, fp, and fn
    grouped = input_df.groupby(['model_name', 'method'] + group_by)
    metrics = grouped.apply(calculate_metrics_for_group)

    return metrics.reset_index()


In [15]:
df["model_name"] = "gpt-4"
df["method"] = "RAG"

In [16]:
#himax(compute_metrics_new(df, group_by=[]))

In [17]:
himax(compute_metrics_new(df, group_by=["ontology"]))

KeyError: 'ic_ratio'

In [None]:
compute_metrics_new(df, group_by=["ontology"], outcome_col="normalized_eval_outcome")
