# DRAGON relationships pre-processing

See [Makefile](Makefile) for how the results were generated

In [7]:
# initial imports
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from oaklib import get_adapter

from typing import List, Dict

from pathlib import Path

import yaml
import glob

In [2]:
def connect(ont: str) -> pd.DataFrame:
    """
    Connect to an ontology via OAK/sqlite
    
    Assumes already in db folder
    :param ont: 
    :return: 
    """
    return get_adapter(f"downloads/{ont}.db")


In [3]:
from linkml_runtime.utils.formatutils import camelcase
from collections import defaultdict
from functools import lru_cache


@lru_cache
def labels2ids(ont: str) -> Dict[str, List[str]]:
    """
    Get a map of labels to ids for an ontology
    
    :param ont: 
    :return: 
    """
    m = defaultdict(list)
    adapter = connect(ont)
    for id, lbl in adapter.labels(list(adapter.entities())):
        if not lbl:
            continue
        lbl = camelcase(lbl)
        m[lbl].append(id)
    return m

cl_map = labels2ids("cl")
len(cl_map)

16012

In [4]:
def revmap(m: Dict[str, List[str]]) -> Dict[str, str]:
    """
    Reverse a one to many map into a one to one map
    
    :param m: 
    :return: 
    """
    r = {}
    for k, v in m.items():
        for x in v:
            r[x] = k
    return r

In [10]:
from copy import copy
from pydantic import BaseModel


class Outcome(BaseModel):
    """
    Model the outcome of the prediction for a particular term
    
    """
    tp: int = 0
    fp: int = 0
    fn: float = 0
    other: List = []
    
    tp_list: List = []
    fp_list: List = []
    fn_list: List = []
    
    new_terms: List[str] = []
    
    rows: List[Dict] = []
    
    def matches(self, other: 'Outcome') -> bool:
        return self.tp == other.tp and self.fp == other.fp and self.fn == other.fn and self.other == other.other

In [14]:

from oaklib.interfaces import OboGraphInterface
from oaklib.datamodels.vocabulary import IS_A
from oaklib.utilities.obograph_utils import graph_to_image, default_stylemap_path

def score_rels(expected_rels, predicted_rels, ont, term_id:str=None, strict=False, base="default", ignore_ungrounded=True) -> Outcome:
    outcome = Outcome()
    labelmap = labels2ids(ont)
    idmap = revmap(labelmap)
    new_terms = set()
    adapter = get_adapter(f"sqlite:obo:{ont}")
    if not isinstance(adapter, OboGraphInterface):
        raise Exception(f"Only OboGraphInterface supported")
    all_preds = set(expected_rels.keys()).union(set(predicted_rels.keys()))
    all_predicted_tgts = set()
    for pred in all_preds:
        if pred.lower() == 'subclassof':
            preds = [IS_A]
        else:
            pred_ids = labelmap.get(pred)
            if not pred_ids:
                pred_ids = [ids[0] for lbl, ids in labelmap.items() if pred.lower() == lbl.lower()]
            if not pred_ids:
                print(f"MISSING: {pred}")
                continue
                preds = [IS_A]
            else:
                preds = [IS_A, *pred_ids]
        preds = list(set(preds))
        def lbl2id(lbl):
            return labelmap.get(lbl, [lbl])
        expected_tgts = {x: lbl2id(x) for x in expected_rels.get(pred, [])}
        predicted_tgts = {x: lbl2id(x) for x in predicted_rels.get(pred, [])}
        for xs in predicted_tgts.values():
            all_predicted_tgts.update(xs)
        new_terms.update({x for x in predicted_tgts.keys() if x not in labelmap})
        #print(f"PRED: {pred} // predicted= {predicted_tgts} // expected= {expected_tgts}")
        
        predicted_accounted_for = set()
        expected_accounted_for = set()
        rows = []
        for expected_tgt in expected_tgts:
            row = {"term_id": term_id,
                   "term_label": adapter.label(term_id),
                   "pred": pred, 
                   "expected_tgt": expected_tgt}
            if expected_tgt in predicted_tgts:
                #print(f"TP: {pred} {expected_tgt}")
                outcome.tp += 1
                outcome.tp_list.append((pred, expected_tgt))
                predicted_accounted_for.add(expected_tgt)
                expected_accounted_for.add(expected_tgt)
                row["outcome"] = "true_positive"
                rows.append(row)
            else:
                ok = False
                
                for expected_tgt_id in expected_tgts.get(expected_tgt, []):
                    ancs = adapter.ancestors(expected_tgt_id, preds)
                    ancs_as_lbls = [idmap.get(x,x) for x in ancs]
                    more_general_predictions = [x for x in ancs_as_lbls if x in predicted_tgts]
                    if more_general_predictions:
                        #print(f"n/a PREDICTED_IS_MORE_GENERAL: {pred} {expected_tgt} // more_general_preds: {more_general_predictions}")
                        predicted_accounted_for.update(more_general_predictions)
                        expected_accounted_for.add(expected_tgt)
                        if strict:
                           outcome.fn += 1
                           outcome.fn_list.append((pred, expected_tgt))
                           row["outcome"] = "fn"
                        else:
                            outcome.fn += 0.5
                            for p in more_general_predictions:
                                outcome.other.append((pred, p, "PredictedMoreGeneralThan", expected_tgt))
                            row["outcome"] = "tp[more_general]"
                        ok = True
                        rows.append(row)
                        break
        for predicted_tgt in predicted_tgts:
            row = {"term_id": term_id,
                   "term_label": adapter.label(term_id),
                   "pred": pred, 
                   "expected_tgt": expected_tgt}
            if predicted_tgt in predicted_accounted_for:
                continue
            if predicted_tgt not in labelmap:
                if ignore_ungrounded:
                    continue
            if term_id:
                ancs = adapter.ancestors(term_id, preds)
                ancs_as_lbls = [idmap.get(x, x) for x in ancs]
                #print(f"CHECKING {term_id} {pred} {predicted_tgt} // ancs: {ancs_as_lbls}")
                if predicted_tgt in ancs_as_lbls:
                    outcome.tp += 1
                    outcome.tp_list.append((pred, predicted_tgt))
                    continue
            ok = False
            row["outcome"] = "fp"
            for predicted_tgt_id in predicted_tgts.get(predicted_tgt, []):
                ancs = adapter.ancestors(predicted_tgt_id, preds)
                ancs_as_lbls = [idmap.get(x, x) for x in ancs]
                #print(pred, predicted_tgt_id, ancs_as_lbls)
                more_general_expected = [x for x in ancs_as_lbls if x in expected_tgts]
                if more_general_expected:
                    #print(f"?? PREDICTED_IS_MORE_SPECIFIC: {pred} {expected_tgt} // more_general_expected: {more_general_expected}")
                    expected_accounted_for.update(more_general_expected)
                    if strict:
                        outcome.fp += 1
                        outcome.fp_list.append((pred, predicted_tgt))
                    else:
                        for e in more_general_expected:
                            outcome.other.append((pred, predicted_tgt_id, "PredictedMoreSpecificThan", e))
                        #outcome.other.append(f"{pred} MoreSpecificThan {predicted_tgt} {more_general_expected}")
                    row["outcome"] = "tp[more_specific]"
                    ok = True
                    break
                rows.append(row)
            if not ok:
                #print(f"FP: {pred} {predicted_tgt}")
                outcome.fp += 1
                outcome.fp_list.append((pred, predicted_tgt))
        for expected_tgt in expected_tgts:
            if expected_tgt in expected_accounted_for:
                continue
            #print(f"FN: {pred} {expected_tgt}")
            outcome.fn += 1
            outcome.fn_list.append((pred, expected_tgt))
    outcome.new_terms = list(new_terms)
    outcome.rows = rows
    if term_id:
        seed = [term_id] + list(all_predicted_tgts)
        #print(seed)
        pred_ids = [IS_A]
        for pred_name in all_preds:
            next_pred_ids = labelmap.get(pred_name, [])
            pred_ids.extend(next_pred_ids)
        pred_ids = list(set(pred_ids))
        graph = adapter.ancestor_graph(seed, pred_ids)
        name = f"term-{term_id}"
        dirn = Path("terms") / base
        dirn.mkdir(exist_ok=True)
        imgfile = f"{name}.png"
        graph_to_image(
            graph,
            seeds=seed,
            stylemap=default_stylemap_path(),
            #configure=configure,
            imgfile=str(dirn / imgfile),
            #view=view,
            format="png",
        )
        mdfile = str(dirn / f"{name}.md")
        with open(mdfile, 'w') as outf:
            outf.write(f"## {term_id}: {adapter.label(term_id)} ({base})\n\n")
            outf.write(f"Outcome: tp: {outcome.tp} fp: {outcome.fp} fn: {outcome.fn}\n\n")
            outf.write(f"### True Positives\n")
            for pred, tgt in outcome.tp_list:
                outf.write(f"* {pred} {tgt}\n")
            outf.write(f"### False Positives\n")
            for pred, tgt in outcome.fp_list:
                outf.write(f"* {pred} {tgt}\n")
            outf.write(f"### False Negatives\n")
            for pred, tgt in outcome.fn_list:
                outf.write(f"* {pred} {tgt}\n")
            outf.write(f"### Other\n")
            for x in outcome.other:
                outf.write(f"* {x}\n")
            outf.write(f"### Predicted terms\n")
            for pred, tgts in predicted_rels.items():
                outf.write(f"* {pred}: {tgts}\n")
            outf.write(f"### Expected terms\n")
            for pred, tgts in expected_rels.items():
                outf.write(f"* {pred}: {tgts}\n")
            outf.write(f"### New terms\n")
            for term in new_terms:
                outf.write(f"* {term}\n")
            outf.write(f"### Graph\n")
            outf.write(f"![{term_id}]({imgfile})\n")
    return outcome
                
outcome = score_rels({"subClassOf": ["Cell"], "PartOf": ["Cerebellum"], "DevelopsFrom": ["StemCell"], "OnlyInTaxon": ["HomoSapiens"]}, {"subClassOf": ["Neuron"], "PartOf": ["Brain", "MadeUp"], "MadeUpRel": ["z", "y"], "OnlyInTaxon": ["HomoSapiens"]}, "cl", term_id="CL:0000540")
print(outcome) 
assert outcome.tp == 2
assert outcome.fp == 0
assert outcome.fn == 2.5
assert len(outcome.other) == 1

MISSING: MadeUpRel
tp=2 fp=0 fn=2.5 other=[('PartOf', 'Brain', 'PredictedMoreGeneralThan', 'Cerebellum')] tp_list=[('OnlyInTaxon', 'HomoSapiens'), ('subClassOf', 'Neuron')] fp_list=[] fn_list=[('subClassOf', 'Cell'), ('DevelopsFrom', 'StemCell')] new_terms=['MadeUp'] rows=[]


In [78]:
from pydantic import ConfigDict
from enum import Enum
from typing import Optional


class OutcomeType(str, Enum):
    TRUE_POSITIVE = "true_positive"
    FALSE_POSITIVE = "false_positive"
    FALSE_NEGATIVE = "false_negative"
    NULL = "null"
    
    
class QualifierType(str, Enum):
    PREDICTION_IS_MORE_GENERAL = "prediction_is_more_general"
    PREDICTION_IS_MORE_SPECIFIC = "prediction_is_more_specific"
    

class RelationshipEval(BaseModel):
    #model_config = ConfigDict(use_enum_values=True)
    ontology: Optional[str] = None
    method: Optional[str] = None
    term_id: str
    term_label: Optional[str] = None
    pred: Optional[str] = None
    outcome: Optional[OutcomeType] = None
    qualifier: Optional[QualifierType] = None
    expected_tgt: Optional[str] = None
    predicted_tgt: Optional[str] = None
    predicted_tgt_exists: bool = True
    
    def destrictify(self):
        if self.qualifier:
            self.outcome = OutcomeType.NULL
            
    def as_dict(self):
        obj = self.model_dump()
        obj["outcome"] = self.outcome.value
        obj["qualifier"] = self.qualifier.value if self.qualifier else None
        return obj
            

t = RelationshipEval(term_id="foo", outcome = OutcomeType.TRUE_POSITIVE)
print(t.as_dict())

{'ontology': None, 'method': None, 'term_id': 'foo', 'term_label': None, 'pred': None, 'outcome': 'true_positive', 'qualifier': None, 'expected_tgt': None, 'predicted_tgt': None, 'predicted_tgt_exists': True}


In [79]:
from oaklib.interfaces import OboGraphInterface
from oaklib.datamodels.vocabulary import IS_A
from oaklib.utilities.obograph_utils import graph_to_image, default_stylemap_path

def get_relationship_outcomes(expected_rels, predicted_rels, ont, term_id:str) -> List[RelationshipEval]:
    labelmap = labels2ids(ont)
    idmap = revmap(labelmap)
    new_terms = set()
    adapter = get_adapter(f"sqlite:obo:{ont}")
    if not isinstance(adapter, OboGraphInterface):
        raise Exception(f"Only OboGraphInterface supported")
    all_preds = set(expected_rels.keys()).union(set(predicted_rels.keys()))
    all_predicted_tgts = set()
    rows = []
    for pred in all_preds:
        # print(pred)
        if pred.lower() == 'subclassof':
            preds = [IS_A]
        else:
            pred_ids = labelmap.get(pred)
            if not pred_ids:
                pred_ids = [ids[0] for lbl, ids in labelmap.items() if pred.lower() == lbl.lower()]
            if not pred_ids:
                print(f"MISSING: {pred}")
                continue
                preds = [IS_A]
            else:
                preds = [IS_A, *pred_ids]
        preds = list(set(preds))
        def lbl2id(lbl):
            return labelmap.get(lbl, [lbl])
        expected_tgts = {x: lbl2id(x) for x in expected_rels.get(pred, [])}
        predicted_tgts = {x: lbl2id(x) for x in predicted_rels.get(pred, [])}
        for xs in predicted_tgts.values():
            all_predicted_tgts.update(xs)
        new_terms.update({x for x in predicted_tgts.keys() if x not in labelmap})
        #print(f"PRED: {pred} // predicted= {predicted_tgts} // expected= {expected_tgts}")
        
        predicted_accounted_for = set()
        expected_accounted_for = set()
        for expected_tgt in expected_tgts:
            row = RelationshipEval(term_id=term_id,
                                   term_label=adapter.label(term_id),
                                   pred=pred,
                                   expected_tgt=expected_tgt)
            if expected_tgt in predicted_tgts:
                #print(f"TP: {pred} {expected_tgt}")
                row.outcome = OutcomeType.TRUE_POSITIVE
                row.predicted_tgt = expected_tgt
                predicted_accounted_for.add(expected_tgt)
                expected_accounted_for.add(expected_tgt)
                rows.append(row)
            else:
                for expected_tgt_id in expected_tgts.get(expected_tgt, []):
                    ancs = adapter.ancestors(expected_tgt_id, preds)
                    ancs_as_lbls = [idmap.get(x,x) for x in ancs]
                    more_general_predictions = [x for x in ancs_as_lbls if x in predicted_tgts]
                    # note: we check for more specific later
                    if more_general_predictions:
                        #print(f"n/a PREDICTED_IS_MORE_GENERAL: {pred} {expected_tgt} // more_general_preds: {more_general_predictions}")
                        predicted_accounted_for.update(more_general_predictions)
                        expected_accounted_for.add(expected_tgt)
                        row.outcome = OutcomeType.FALSE_NEGATIVE
                        row.qualifier = QualifierType.PREDICTION_IS_MORE_GENERAL
                        row.predicted_tgt = "; ".join(more_general_predictions)
                        rows.append(row)
                        break
        for predicted_tgt in predicted_tgts:
            row = RelationshipEval(term_id=term_id,
                                   term_label=adapter.label(term_id),
                                   pred=pred,
                                   predicted_tgt=predicted_tgt,
                                   )
            if predicted_tgt in predicted_accounted_for:
                continue
            if predicted_tgt not in labelmap:
                row.predicted_tgt_exists = False
                #if ignore_ungrounded:
                #    continue
            ancs = adapter.ancestors(term_id, preds)
            ancs_as_lbls = [idmap.get(x, x) for x in ancs]
            #print(f"CHECKING {term_id} {pred} {predicted_tgt} // ancs: {ancs_as_lbls}")
            if predicted_tgt in ancs_as_lbls:
                row.outcome = OutcomeType.TRUE_POSITIVE
                continue
            row.outcome = OutcomeType.FALSE_POSITIVE
            for predicted_tgt_id in predicted_tgts.get(predicted_tgt, []):
                ancs = adapter.ancestors(predicted_tgt_id, preds)
                ancs_as_lbls = [idmap.get(x, x) for x in ancs]
                #print(pred, predicted_tgt_id, ancs_as_lbls)
                more_general_expected = [x for x in ancs_as_lbls if x in expected_tgts]
                if more_general_expected:
                    #print(f"?? PREDICTED_IS_MORE_SPECIFIC: {pred} {expected_tgt} // more_general_expected: {more_general_expected}")
                    expected_accounted_for.update(more_general_expected)
                    row.outcome = OutcomeType.FALSE_POSITIVE
                    row.qualifier = QualifierType.PREDICTION_IS_MORE_SPECIFIC
                    row.expected_tgt = "; ".join(more_general_expected)
                    break
                rows.append(row)
        for expected_tgt in expected_tgts:
            # any remaining expected targets that are unaccounted for are false negatives
            if expected_tgt in expected_accounted_for:
                continue
            row = RelationshipEval(term_id=term_id,
                                   term_label=adapter.label(term_id),
                                   pred=pred,
                                   expected_tgt=expected_tgt,
                                   outcome=OutcomeType.FALSE_NEGATIVE)
            rows.append(row)
    term_label = adapter.label(term_id)
    for row in rows:
        row.term_label=term_label
        row.ontology = ont
    return rows
                
rows = get_relationship_outcomes(
    {
         "subClassOf": ["Cell"],
         "PartOf": ["Cerebellum"], 
         "DevelopsFrom": ["StemCell"],
         "OnlyInTaxon": ["HomoSapiens"]
    }, 
    {
        "subClassOf": ["Neuron"], # more specific
        "PartOf": ["Brain", "MadeUp"], # more general, new
        "MadeUpRel": ["z", "y"], 
        "OnlyInTaxon": ["HomoSapiens"]
    }, 
    "cl",
    term_id="CL:0000540"
)
print(len(rows))
for row in rows:
    print(row.model_dump())
    
df = pd.DataFrame([row.as_dict() for row in rows])
df

MISSING: MadeUpRel
5
{'ontology': 'cl', 'method': None, 'term_id': 'CL:0000540', 'term_label': 'neuron', 'pred': 'OnlyInTaxon', 'outcome': <OutcomeType.TRUE_POSITIVE: 'true_positive'>, 'qualifier': None, 'expected_tgt': 'HomoSapiens', 'predicted_tgt': 'HomoSapiens', 'predicted_tgt_exists': True}
{'ontology': 'cl', 'method': None, 'term_id': 'CL:0000540', 'term_label': 'neuron', 'pred': 'subClassOf', 'outcome': <OutcomeType.FALSE_NEGATIVE: 'false_negative'>, 'qualifier': None, 'expected_tgt': 'Cell', 'predicted_tgt': None, 'predicted_tgt_exists': True}
{'ontology': 'cl', 'method': None, 'term_id': 'CL:0000540', 'term_label': 'neuron', 'pred': 'PartOf', 'outcome': <OutcomeType.FALSE_NEGATIVE: 'false_negative'>, 'qualifier': <QualifierType.PREDICTION_IS_MORE_GENERAL: 'prediction_is_more_general'>, 'expected_tgt': 'Cerebellum', 'predicted_tgt': 'Brain', 'predicted_tgt_exists': True}
{'ontology': 'cl', 'method': None, 'term_id': 'CL:0000540', 'term_label': 'neuron', 'pred': 'PartOf', 'outco

Unnamed: 0,ontology,method,term_id,term_label,pred,outcome,qualifier,expected_tgt,predicted_tgt,predicted_tgt_exists
0,cl,,CL:0000540,neuron,OnlyInTaxon,true_positive,,HomoSapiens,HomoSapiens,True
1,cl,,CL:0000540,neuron,subClassOf,false_negative,,Cell,,True
2,cl,,CL:0000540,neuron,PartOf,false_negative,prediction_is_more_general,Cerebellum,Brain,True
3,cl,,CL:0000540,neuron,PartOf,false_positive,,,MadeUp,False
4,cl,,CL:0000540,neuron,DevelopsFrom,false_negative,,StemCell,,True


In [80]:
def parse_relationships(rel_str: str) -> Dict[str, List[str]]:
    if pd.isnull(rel_str):
        return {}
    if not isinstance(rel_str, str):
        print(f"BAD: {rel_str}")
        return {}
    rels = yaml.safe_load(rel_str)
    m = defaultdict(list)
    for rel in rels:
        if "predicate" not in rel or "target" not in rel:
            print(f"BAD REL: {rel} in {rel_str}")
            continue
        if not isinstance(rel["target"], str):
            print(f"BAD REL: {rel} in {rel_str}")
            continue
        m[rel["predicate"]].append(rel["target"])
    return m

In [83]:
def parse_predictions_to_outcomes(f: str, ont: str, preds: List=None, strict=False, method="no_method") -> List[dict]:
    #adapter = get_adapter(f"sqlite:obo:{ont}")
    res = pd.read_csv(f, comment="#", sep="\t")
    rows = []
    for _, row in res.iterrows():
        expected_rels = parse_relationships(row['expected_relationships'])
        predicted_rels = parse_relationships(row['predicted_relationships'])
        term_id = row["masked_original_id"]
        if preds:
            expected_rels = {k: v for k, v in expected_rels.items() if k in preds}
            predicted_rels = {k: v for k, v in predicted_rels.items() if k in preds}
            base = f"{method}-{'+'.join(preds)}"
        else:
            base = method
        #print(f"Scoring: {expected_rels} // {predicted_rels}")
        this_rows = get_relationship_outcomes(expected_rels, predicted_rels, ont, term_id=term_id)
        rows.extend(this_rows)
    for row in rows:
        row.method = method
    return [row.as_dict() for row in rows]

rows = parse_predictions_to_outcomes("results/ont_cl-Prelationships-Mid.original_id-BGFalse-TrNone-Te50-Mgpt-4-EMopenai.results.tsv", "cl", method="RAG+gpt4")
df = pd.DataFrame(rows)
df.to_csv("foo.tsv", sep="\t")
df


MISSING: ReceivesConnectivityInputFrom
MISSING: AxonIn
MISSING: ConnectsTo
MISSING: HasSynapticConvergenceOf
MISSING: HasSynapticDivergentTo
MISSING: HasCytosolPart
MISSING: isPartOf
MISSING: minimalContactWith
MISSING: SynapsesWith
MISSING: LacksIntracellularComponent


Unnamed: 0,ontology,method,term_id,term_label,pred,outcome,qualifier,expected_tgt,predicted_tgt,predicted_tgt_exists
0,cl,RAG+gpt4,CL:4033045,lung migratory dendritic cell,subClassOf,true_positive,,DendriticCell,DendriticCell,True
1,cl,RAG+gpt4,CL:4033045,lung migratory dendritic cell,PartOf,false_positive,,,Lung,True
2,cl,RAG+gpt4,CL:4033019,ON-blue cone bipolar cell,subClassOf,false_positive,,,PhotopicRetinalBipolarCell,False
3,cl,RAG+gpt4,CL:4033019,ON-blue cone bipolar cell,subClassOf,false_negative,,ONBipolarCell,,True
4,cl,RAG+gpt4,CL:4033029,diffuse bipolar 3a cell,subClassOf,false_positive,,,OFFCalbindinPositiveBipolarCell,False
...,...,...,...,...,...,...,...,...,...,...
157,cl,RAG+gpt4,CL:4030037,late spermatid,CapableOf,false_negative,,SpermatidDevelopment,,True
158,cl,RAG+gpt4,CL:4030037,late spermatid,subClassOf,true_positive,,Spermatid,Spermatid,True
159,cl,RAG+gpt4,CL:4030037,late spermatid,PartOf,false_negative,,MaleReproductiveSystem,,True
160,cl,RAG+gpt4,CL:4030035,dental pulp stem cell,subClassOf,true_positive,,StemCell,StemCell,True


In [None]:
outcome = score_rels({"subClassOf": ["DentalPulpStemCell"]}, {"PartOf": ["DentalPulp"]}, "cl", term_id="CL:4030035")
outcome
assert outcome.tp == 1
assert outcome.fp == 0
assert outcome.fn == 1

In [None]:
assert score_rels({"subClassOf": ["MadeUp"]}, {}, "cl").matches(Outcome(tp=0, fp=0, fn=1))

In [None]:
assert score_rels({}, {}, "cl").matches(Outcome(tp=0, fp=0, fn=0))

In [None]:
assert score_rels({"subClassOf": ["Cell"]}, {}, "cl").matches(Outcome(tp=0, fp=0, fn=1, other=[]))

In [None]:
assert score_rels({}, {"subClassOf": ["Cell"]}, "cl") .matches(Outcome(tp=0, fp=1, fn=0))

In [None]:
assert score_rels({"subClassOf": ["Cell"]}, {"subClassOf": ["Cell"]}, "cl").matches(Outcome(tp=1, fp=0, fn=0))

In [None]:
outcome = score_rels({"subClassOf": ["Cell"]}, {"subClassOf": ["Neuron"]}, "cl")
assert outcome.tp == 0
assert outcome.fp == 0
assert outcome.fn == 0
assert len(outcome.other) == 1

In [13]:
outcome = score_rels({"subClassOf": ["Neuron"]}, {"subClassOf": ["Cell"]}, "cl")
print(outcome)
assert outcome.tp == 0
assert outcome.fp == 0
assert outcome.fn == 0.5
assert len(outcome.other) == 1

tp=0 fp=0 fn=0.5 other=[('subClassOf', 'Cell', 'PredictedMoreGeneralThan', 'Neuron')] tp_list=[] fp_list=[] fn_list=[] new_terms=[] rows=[{'term_id': None, 'term_label': None, 'pred': 'subClassOf', 'expected_tgt': 'Neuron', 'outcome': 'tp[more_general]'}]


In [None]:
assert score_rels({"subClassOf": ["Neuron"]}, {"subClassOf": ["StemCell"]}, "cl").matches(Outcome(tp=0, fp=1, fn=1))


In [None]:
def load_relationships_results_tsv(f: str, ont: str, preds: List=None, strict=False, method="no_method"):
    #adapter = get_adapter(f"sqlite:obo:{ont}")
    res = pd.read_csv(f, comment="#", sep="\t")
    rows = []
    for _, row in res.iterrows():
        expected_rels = parse_relationships(row['expected_relationships'])
        predicted_rels = parse_relationships(row['predicted_relationships'])
        term_id = row["masked_original_id"]
        if preds:
            expected_rels = {k: v for k, v in expected_rels.items() if k in preds}
            predicted_rels = {k: v for k, v in predicted_rels.items() if k in preds}
            base = f"{method}-{'+'.join(preds)}"
        else:
            base = method
        #print(f"Scoring: {expected_rels} // {predicted_rels}")
        outcome = score_rels(expected_rels, predicted_rels, ont, term_id=term_id, strict=strict, base=base, ignore_ungrounded=True)
        if outcome.other:
            print(f"Other: {outcome}")
        outcome = outcome.model_dump()
        #outcome["expected_relationships"] = json.dumps(expected_rels)
        #outcome["predicted_relationships"] = json.dumps(predicted_rels)
        outcome = {"ontology": ont, 
                   "preds": ",".join(preds) if preds else "all",
                   **outcome, **row.to_dict()}
        rows.append(outcome)
    return rows

#rows = load_relationships_results_tsv("results/ont_cl-Prelationships-Mid.original_id-BGFalse-TrNone-Te50-Mgpt-4-EMopenai.results.tsv", "cl", strict=False, method="RAG+gpt4")
#df = pd.DataFrame(rows)
#df.to_csv("foo.tsv", sep="\t")
#df

In [None]:
#rows = load_relationships_results_tsv("results/ont_cl-Prelationships-Mid.original_id-BGFalse-TrNone-Te50-Mgpt-4-EMopenai.results.tsv", "cl", preds=["subClassOf"])
#df = pd.DataFrame(rows)
#df

In [87]:
def get_method(obj: dict) -> str:
    """
    gets the method from the yaml object
    """
    if obj.get("generate_background"):
        return "RAG+background"
    additional = obj.get("additional_collections", [])
    if additional:
        assert len(additional) == 1
        x = additional[0]
        if x.startswith("gh_"):
            return "RAG+github"
        elif x.startswith("devdocs"):
            return "RAG+devdocs"
        else:
            assert False
    else:
        return "RAG"

In [None]:
import re

def load_all_results(preds=None):
    files = glob.glob("results/*.results.yaml")
    rows = []
    for f in files:
        print(f)
        obj = yaml.safe_load(open(f))
        if "Prelationships" not in f:
            continue
        # look for string '/ont_{ONT}-Prelationships' using regex
        m = re.search(r"ont_(\w+)-Prelationships", f)
        assert m
        meth = get_method(obj) + "-" + obj["model_name"]
        print(f, meth)
        ont = m.group(1)
        this_rows = load_relationships_results_tsv(f.replace(".yaml", ".tsv"), ont, preds, strict=False, method=meth)
        for row in this_rows:
            row = {**obj, **row}
            row["method"] = meth
            rows.append(row)
        #rows.extend(this_rows)
        
    return pd.DataFrame(rows)


In [None]:
df = load_all_results()

In [None]:
df.to_csv("results/relationship-analysis.tsv", sep="\t", index=False)

In [None]:
isa_df = load_all_results(["subClassOf"])
isa_df.to_csv("results/relationship-analysis-isa.tsv", sep="\t")

In [88]:
import re

def load_all_results_new(preds=None):
    files = glob.glob("results/*.results.yaml")
    rows = []
    for f in files:
        print(f)
        obj = yaml.safe_load(open(f))
        if "Prelationships" not in f:
            continue
        # look for string '/ont_{ONT}-Prelationships' using regex
        m = re.search(r"ont_(\w+)-Prelationships", f)
        assert m
        meth = get_method(obj) + "-" + obj["model_name"]
        print(f, meth)
        ont = m.group(1)
        this_rows = parse_predictions_to_outcomes(f.replace(".yaml", ".tsv"), ont, preds, strict=False, method=meth)
        for row in this_rows:
            row = {**obj, **row}
            # row["method"] = meth
            rows.append(row)
        #rows.extend(this_rows)
        
    return pd.DataFrame(rows)


In [89]:
df = load_all_results_new()
df

results/ont_oba-Plabel-Mid.original_id-BGFalse-TrNone-Te50-Mgpt-3.5-turbo-EMopenai.results.yaml
results/ont_cl-Pdefinition-Mid.original_id-BGTrue-TrNone-Te50-Mgpt-4-EMopenai.results.yaml
results/ont_go-Plabel-Mid.original_id-BGTrue-TrNone-Te50-Mgpt-4-EMopenai.results.yaml
results/ont_uberon-Plabel-Mid.original_id-BGFalse-TrNone-Te40-Mgpt-4-EMopenai.results.yaml
results/ont_mondo-Plabel-Mid.original_id-BGTrue-TrNone-Te50-Mgpt-3.5-turbo-EMopenai.results.yaml
results/ont_go-Prelationships-Mid.original_id-BGTrue-TrNone-Te50-Mgpt-4-EMopenai.results.yaml
results/ont_go-Prelationships-Mid.original_id-BGTrue-TrNone-Te50-Mgpt-4-EMopenai.results.yaml RAG+background-gpt-4
MISSING: HasProduct
results/ont_envo-Plabel-Mid.original_id-BGTrue-TrNone-Te50-Mgpt-4-EMopenai.results.yaml
results/ont_mondo-Plabel-Mid.original_id-BGFalse-TrNone-Te50-Mnous-hermes-13b-EMopenai.results.yaml
results/ont_hp-Prelationships-Mid.original_id-BGFalse-TrNone-Te50-Mgpt-4-EMopenai.results.yaml
results/ont_hp-Prelationshi

Unnamed: 0,model_name,embedding_model_name,generate_background,task_started,task_finished,executed_on,agent,extractor,method,source_db_path,...,ontology,term_id,term_label,pred,outcome,qualifier,expected_tgt,predicted_tgt,predicted_tgt_exists,additional_collections
0,gpt-4,openai:,True,2023-09-01 10:18:02.586748,2023-09-01 10:48:40.723027,Darwin-21.6.0-Darwin Kernel Version 21.6.0: Mo...,dae,BasicExtractor,RAG+background-gpt-4,db,...,go,GO:0160073,Casparian strip assembly,subClassOf,true_positive,,CellularComponentAssembly,CellularComponentAssembly,True,
1,gpt-4,openai:,True,2023-09-01 10:18:02.586748,2023-09-01 10:48:40.723027,Darwin-21.6.0-Darwin Kernel Version 21.6.0: Mo...,dae,BasicExtractor,RAG+background-gpt-4,db,...,go,GO:0160073,Casparian strip assembly,PartOf,false_negative,,PlantTypeCellWallAssembly,,True,
2,gpt-4,openai:,True,2023-09-01 10:18:02.586748,2023-09-01 10:48:40.723027,Darwin-21.6.0-Darwin Kernel Version 21.6.0: Mo...,dae,BasicExtractor,RAG+background-gpt-4,db,...,go,GO:0160073,Casparian strip assembly,ResultsInAssemblyOf,true_positive,,CasparianStrip,CasparianStrip,True,
3,gpt-4,openai:,True,2023-09-01 10:18:02.586748,2023-09-01 10:48:40.723027,Darwin-21.6.0-Darwin Kernel Version 21.6.0: Mo...,dae,BasicExtractor,RAG+background-gpt-4,db,...,go,GO:0120340,radial spoke base 1,subClassOf,true_positive,,RadialSpokeBase,RadialSpokeBase,True,
4,gpt-4,openai:,True,2023-09-01 10:18:02.586748,2023-09-01 10:48:40.723027,Darwin-21.6.0-Darwin Kernel Version 21.6.0: Mo...,dae,BasicExtractor,RAG+background-gpt-4,db,...,go,GO:0120340,radial spoke base 1,PartOf,true_positive,,RadialSpoke1,RadialSpoke1,True,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5229,gpt-4,openai:,True,2023-09-03 22:37:08.004964,2023-09-03 23:02:01.765563,Darwin-21.6.0-Darwin Kernel Version 21.6.0: Mo...,dae,BasicExtractor,RAG+background-gpt-4,db,...,foodon,FOODON:00004454,green cardamom seed (dried),subClassOf,true_positive,,CardamomFoodProduct,CardamomFoodProduct,True,
5230,gpt-4,openai:,True,2023-09-03 22:37:08.004964,2023-09-03 23:02:01.765563,Darwin-21.6.0-Darwin Kernel Version 21.6.0: Mo...,dae,BasicExtractor,RAG+background-gpt-4,db,...,foodon,FOODON:00004454,green cardamom seed (dried),DerivesFrom,false_positive,,,GreenCardamomPlant,True,
5231,gpt-4,openai:,True,2023-09-03 22:37:08.004964,2023-09-03 23:02:01.765563,Darwin-21.6.0-Darwin Kernel Version 21.6.0: Mo...,dae,BasicExtractor,RAG+background-gpt-4,db,...,foodon,FOODON:00004438,common millet (cooked),subClassOf,false_positive,,,Food_cooked_,False,
5232,gpt-4,openai:,True,2023-09-03 22:37:08.004964,2023-09-03 23:02:01.765563,Darwin-21.6.0-Darwin Kernel Version 21.6.0: Mo...,dae,BasicExtractor,RAG+background-gpt-4,db,...,foodon,FOODON:00004438,common millet (cooked),subClassOf,false_positive,,,Common_millet_wholeOrPieces_,False,


In [90]:
df.to_csv("results/predicted-relationships-for-eval.tsv", sep="\t", index=False)

In [91]:
df.query("method == 'RAG-gpt-4'").to_csv("results/predicted-relationships-for-eval-rag-gpt-4.tsv", sep="\t", index=False)

## Loading YAML result objects

Each run generates a single yaml object

In [None]:


def load_results(f):
    """Loads main results summaries"""
    obj = yaml.safe_load(open(f))
    ont = obj["source_collection"].replace("ont_", "")
    for k, v in obj["results"].items():
        obj[k] = v
    del obj["results"]
    obj["task"] = " ".join(obj["fields_to_predict"])
    obj["ontology"] = ont
    #additional = obj.get("additional_collections", [])
    #is_github = any(x for x in additional if x.startswith("gh_")) if additional else []
    #obj["method"] = "background" if obj.get("generate_background") else ("github" if is_github else "direct")
    obj["method"] = get_method(obj)
    return obj

from pathlib import Path
def load_granular_results():
    files = glob.glob("results/*.results.tsv")
    dfs = []
    for file in files:
        yaml_path = file.replace(".tsv", ".yaml")
        if not Path(yaml_path).exists():
            continue
        meta = yaml.safe_load(open(yaml_path))
        # print(file)
        res = pd.read_csv(file, comment="#", sep="\t")
        #res['task'] = res.apply(lambda row :  " ".join(row.get("fields_to_predict", [])), axis = 1)
        #res['ontology'] = res.apply(lambda row :  row.get("source_collection", "").replace("ont_", ""), axis = 1)
        res['task'] =  " ".join(meta.get("fields_to_predict"))
        res['ontology'] = meta.get("source_collection", "").replace("ont_", "")
        res["method"] = get_method(meta)
        res["model_name"] = meta["model_name"]
        res["accuracy"] = res["metric_accuracy"]
        dfs.append(res)
    return pd.concat(dfs)

all_df = load_granular_results()
all_df.describe(include='all')

In [None]:
import glob

def load_all_results() -> List[dict]:
    """
    load all yaml files
    """
    files = glob.glob("results/*.results.yaml")
    objs = [load_results(f) for f in files]
    return objs

objs = load_all_results()
len(objs)
    

In [None]:
df = pd.DataFrame(objs)
df = df.query("model_name != 'llama2'") # incomplete
df.describe(include="all")

In [None]:
df.query(f"task=='relationships' and method=='direct' and model_name=='gpt-4'")

In [None]:

# Line plot for precision across different models and methods
#sns.lineplot(data=df, x='model_name', y='precision', hue='method')
#plt.show()

# Heatmap for precision
heatmap_data = pd.pivot_table(df, values='precision', 
                              index=['model_name'], 
                              columns='method')
sns.heatmap(heatmap_data, annot=True, cmap="YlGnBu")
plt.show()

# Boxplot for precision across different methods
sns.boxplot(data=df, x='method', y='precision')
plt.show()

sns.boxplot(data=df, x='method', y='recall')
plt.show()

sns.boxplot(data=df, x='method', y='accuracy')
plt.show()



In [None]:
from scipy import stats


def hm(task, method, df_x=None, metric="accuracy"):
    if df_x is None:
        df_x = df
    df_x = df_x.query(f"task=='{task}' and method=='{method}'")

    # Heatmap for precision
    heatmap_data = pd.pivot_table(df_x, values=metric, 
                                  index=['model_name'], 
                                  columns='ontology')
    sns.heatmap(heatmap_data, annot=True, cmap="YlGnBu")
    plt.show()

def my_ttest(task, method):
    all_df_x = all_df.query(f"task=='{task}' and method=='{method}'")
    models = all_df_x["model_name"].unique()
    for m1 in models:
        for m2 in models:
            if m2 >= m1:
                continue
            model1_data = all_df_x[(all_df_x['model_name'] == m1)]['accuracy']
            model2_data = all_df_x[(all_df_x['model_name'] == m2)]['accuracy']

            # Step 2: Conduct the T-Test
            # Use ttest_ind to conduct the t-test
            t_statistic, p_value = stats.ttest_ind(model1_data, model2_data)

            print(f'{m1} v {m2}: len {len(model1_data)} v len {len(model2_data)}')
            print(f'{m1} v {m2}: T-statistic: {t_statistic}')
            print(f'{m1} v {m2}: P-value: {p_value}')
    
    

def lp(task, df_x=None):
    if df_x is None:
        df_x = df
    df_x = df_x.query(f"task=='{task}'")

    sns.lineplot(data=df_x, x='model_name', y='accuracy', hue='method')
    plt.show()

## Relationships Task

In [None]:
hm("relationships", "direct")

In [None]:
hm("relationships", "direct", metric="precision")

In [None]:
my_ttest("relationships", "direct")

In [None]:
lp("relationships")

## Labels Task

In [None]:
hm("label", "direct")

In [None]:
my_ttest("label", "direct")

In [None]:
lp("label")

## Logical Definitions

In [None]:
import numpy as np

def load_ldef_results_tsv(f):
    res = pd.read_csv(f, comment="#", sep="\t")
    res['expected_logical_definition'].replace('', np.nan, inplace=True)
    res = res[res['expected_logical_definition'].notnull()]
    # Calculate Accuracy per row: (TP + TN) / (TP + TN + FP + FN)
    # Since we can't calculate TN here, we'll omit it from the formula
    res['accuracy'] = res['metric_true_positives'] / (res['metric_true_positives'] + res['metric_false_positives'] + res['metric_false_negatives'])
    res['precision'] = res['metric_true_positives'] / (res['metric_true_positives'] + res['metric_false_positives'])

    return res
    

res = load_ldef_results_tsv("results/ont_go-Plogical_definition-Mid.original_id-BGFalse-TrNone-Te50-Mgpt-4-EMopenai.results.tsv")
res.describe(include="all")

In [None]:
def load_all_results_ldefs():
    files = glob.glob("results/*.results.yaml")
    objs = []
    for f in files:
        obj = load_results(f)
        if obj["task"] != "logical_definition":
            continue
        accuracy = load_ldef_results_tsv(f.replace(".yaml", ".tsv"))["accuracy"].mean()
        obj["accuracy"] = accuracy
        objs.append(obj)
    return objs

objs = load_all_results_ldefs()

In [None]:
ldefs_df = pd.DataFrame(objs)

### Logical Definitions (Excluding terms with none)

In [None]:
hm("logical_definition", "direct", ldefs_df)

### Logical Definitions (including over-prediction)

In [None]:
hm("logical_definition", "direct")

## Definitions Task

### Definitions (direct)

No additional background knowledge

In [None]:
hm("definition", "direct")

In [None]:
my_ttest("definition", "direct")

### Definitions with github as background method

In [None]:
hm("definition", "github")

### Definitions with auto-generated background summaries

In [None]:
hm("definition", "background")

In [None]:
lp("definition")

### Definitions using developer docs

In [None]:
df.query("ontology == 'cl' and task == 'definition'")["method"].unique()

In [None]:
hm("definition", "devdocs", df.query("ontology == 'cl'"))

### Significance of different methods

In [None]:
methods = all_df["method"].unique()
models = all_df["model_name"].unique()
for m1 in methods:
    for m2 in methods:
        if m2 >= m1:
            continue
        for mn in list(models) + [None]:
            if mn is None:
                all_df_x = all_df.query("task == 'definition'")
            else:
                all_df_x = all_df.query(f"task == 'definition' and model_name == '{mn}'")
            m1_data = all_df_x[(all_df_x['method'] == m1)]['accuracy']
            m2_data = all_df_x[(all_df_x['method'] == m2)]['accuracy']

            # Step 2: Conduct the T-Test
            # Use ttest_ind to conduct the t-test
            t_statistic, p_value = stats.ttest_ind(m1_data, m2_data)

            print(f'{mn}:: {m1} v {m2}: T-statistic: {t_statistic}')
            print(f'{mn}:: {m1} v {m2}: P-value: {p_value}')


## Subsumption Analysis

Note that the relationships stats above are calculated on predicting *all* relationships.

To compare with owl2vec, we filter these for only subsumptions (subClassOf) relationships. See https://arxiv.org/pdf/2009.14654.pdf


In [None]:
import json

def subsumptions(rel_str: str):
    if pd.isnull(rel_str):
        return []
    if not isinstance(rel_str, str):
        print(f"BAD: {rel_str}")
        return []
    rels = yaml.safe_load(rel_str)
    for rel in rels:
         if "predicate" not in rel:
             print(f"BAD REL: {rel} in {rel_str}")
    return [r["target"] for r in rels if r.get("predicate", "") =="subClassOf"]

def calculate_metrics(row):
    expected_set = set(row['expected_subsumptions'])
    predicted_set = set(row['predicted_subsumptions'])
    
    TP = len(expected_set & predicted_set)
    FP = len(predicted_set - expected_set)
    FN = len(expected_set - predicted_set)
    
    # In a classical setting, TN would be calculated here. 
    # However, if you don't have information about what could have been predicted 
    # but wasn't (i.e., the universal set), TN can't be calculated.
    
    return pd.Series({'TP': TP, 'FP': FP, 'FN': FN})
    

def load_subsumption_results_tsv(f):
    res = pd.read_csv(f, comment="#", sep="\t")
    res['expected_subsumptions'] = res.apply(lambda row : subsumptions(row['expected_relationships']), axis = 1)
    res['predicted_subsumptions'] = res.apply(lambda row : subsumptions(row['predicted_relationships']), axis = 1)
    metrics_df = res.apply(calculate_metrics, axis=1)

    # Add the metrics as new columns to the original DataFrame
    df_with_metrics = pd.concat([df, metrics_df], axis=1)

    # Calculate Accuracy per row: (TP + TN) / (TP + TN + FP + FN)
    # Since we can't calculate TN here, we'll omit it from the formula
    df_with_metrics['accuracy'] = df_with_metrics['TP'] / (df_with_metrics['TP'] + df_with_metrics['FP'] + df_with_metrics['FN'])
    df_with_metrics['precision'] = df_with_metrics['TP'] / (df_with_metrics['TP'] + df_with_metrics['FP'])

    return df_with_metrics
    

res = load_subsumption_results_tsv("results/ont_uberon-Prelationships-Mid.original_id-BGFalse-TrNone-Te40-Mgpt-4-EMopenai.results.tsv")
res.describe(include="all")

In [None]:
import json

def subsumptions(rel_str: str):
    if pd.isnull(rel_str):
        return []
    if not isinstance(rel_str, str):
        print(f"BAD: {rel_str}")
        return []
    rels = yaml.safe_load(rel_str)
    for rel in rels:
         if "predicate" not in rel:
             print(f"BAD REL: {rel} in {rel_str}")
    return [r["target"] for r in rels if r.get("predicate", "") =="subClassOf"]

def calculate_metrics(row):
    expected_set = set(row['expected_subsumptions'])
    predicted_set = set(row['predicted_subsumptions'])
    
    TP = len(expected_set & predicted_set)
    FP = len(predicted_set - expected_set)
    FN = len(expected_set - predicted_set)
    
    # In a classical setting, TN would be calculated here. 
    # However, if you don't have information about what could have been predicted 
    # but wasn't (i.e., the universal set), TN can't be calculated.
    
    return pd.Series({'TP': TP, 'FP': FP, 'FN': FN})
    

def load_subsumption_results_tsv(f):
    res = pd.read_csv(f, comment="#", sep="\t")
    res['expected_subsumptions'] = res.apply(lambda row : subsumptions(row['expected_relationships']), axis = 1)
    res['predicted_subsumptions'] = res.apply(lambda row : subsumptions(row['predicted_relationships']), axis = 1)
    metrics_df = res.apply(calculate_metrics, axis=1)

    # Add the metrics as new columns to the original DataFrame
    df_with_metrics = pd.concat([df, metrics_df], axis=1)

    # Calculate Accuracy per row: (TP + TN) / (TP + TN + FP + FN)
    # Since we can't calculate TN here, we'll omit it from the formula
    df_with_metrics['accuracy'] = df_with_metrics['TP'] / (df_with_metrics['TP'] + df_with_metrics['FP'] + df_with_metrics['FN'])
    df_with_metrics['precision'] = df_with_metrics['TP'] / (df_with_metrics['TP'] + df_with_metrics['FP'])

    return df_with_metrics
    

#res = load_subsumption_results_tsv("results/ont_uberon-Prelationships-Mid.original_id-BGFalse-TrNone-Te40-Mgpt-4-EMopenai.results.tsv")
#res.describe(include="all")

In [None]:
res["accuracy"].mean()

In [None]:
def load_all_results_subsumptions():
    files = glob.glob("results/*.results.yaml")
    objs = []
    for f in files:
        obj = load_results(f)
        if obj["task"] != "relationships":
            continue
        accuracy = load_subsumption_results_tsv(f.replace(".yaml", ".tsv"))["accuracy"].mean()
        obj["accuracy"] = accuracy
        objs.append(obj)
    return objs

objs = load_all_results_subsumptions()

### Main subsumption results

In [None]:
subs_df = pd.DataFrame(objs)

In [None]:
hm("relationships", "direct", subs_df)

In [None]:
hm("relationships", "direct", subs_df, metric="precision")

### Subsumptions using background

In [None]:
hm("relationships", "background", subs_df)

### Subsumptions using github

In [None]:
#hm("relationships", "github", subs_df)

## Preparing definitions for evaluation

In [None]:
defs_df = all_df.query("task == 'definition'")
defs_df.groupby('ontology').size().reset_index(name='count')

In [None]:
len(defs_df)

In [None]:
len(defs_df["expected_definition"].unique())

In [None]:

df_defs_gpt_direct = pd.read_csv("results/cl-defs-direct.tsv", sep="\t")
gpt_direct = { row["id"]: row["definition"] for _, row in df_defs_gpt_direct.iterrows() }
objs = []
for _, row in defs_df.iterrows():
    id = row["masked_original_id"]
    if id not in gpt_direct:
        continue
    row = {**row}
    row["predicted_definition"] = gpt_direct[id]
    row["model_name"] = "gpt-4"
    row["method"] = "no_RAG"
    del gpt_direct[id]
    objs.append(row)
all_defs_df = pd.concat([defs_df, pd.DataFrame(objs)])
all_defs_df