# Creating the Evaluation Datasets with Langfuse

In [94]:
import os
import re 
from datetime import datetime
import json
import importlib
import pandas as pd
from dotenv import load_dotenv
from langsmith import Client, evaluate
from phenopy.score import Scorer
from langsmith.schemas import Run, Example
from phenopy.build_hpo import generate_annotated_hpo_network
import sys
# Make the helper modules that live in ../utils importable.
# The path is relative, so it is resolved against the kernel's working directory.
sys.path.append('../utils')

# Each helper is imported under a module alias and the chain object it exposes
# is bound to a top-level name.
import customchain as cc
custom_chain = cc.custom_chain

# The `rgc` alias is what a later cell passes to importlib.reload().
import rawgptchain as rgc
rawgptchain = rgc.rawgptchain

In [95]:
# Load variables from a .env file into the process environment.
# override=True lets values from the file replace variables that are already set.
load_dotenv(override=True)

True

Creación de los datasets de RAG-HPO y GSC

In [None]:
from langfuse import Langfuse
 
# Client built with no explicit arguments.
# NOTE(review): presumably it reads its credentials/host from the environment
# populated by load_dotenv — confirm.
# NOTE(review): this variable shares its name with the `langfuse` package.
langfuse = Langfuse()

In [89]:
# Create the "RAGHPO" dataset in Langfuse; its items are uploaded by a later cell.
langfuse.create_dataset(
    name="RAGHPO",
    # optional description
    description="Dataset en español para la evaluación de herramientas de codificación fenotípica.",
    # optional metadata
    metadata={
        "author": "mdiazrio",
        "date": "2025-04-25",
        "type": "benchmark"
    }
)

Dataset(id='cm9x6ivv80096pf0604is8yzj', name='RAGHPO', description='Dataset en español para la evaluación de herramientas de codificación fenotípica.', metadata={'date': '2025-04-25', 'type': 'benchmark', 'author': 'mdiazrio'}, project_id='cm9vlvsif0006pf07xcychmbe', created_at=datetime.datetime(2025, 4, 25, 19, 23, 13, 748000, tzinfo=datetime.timezone.utc), updated_at=datetime.datetime(2025, 4, 25, 19, 23, 13, 748000, tzinfo=datetime.timezone.utc))

In [13]:
# Load the RAG-HPO test cases.
df = pd.read_csv('../../datasets/RAG-HPO/Test_Cases.csv')
# The annotations column is read as text; eval() turns each cell back into a Python object.
# NOTE(review): eval() executes whatever code the CSV contains. If the cells are
# plain literals (e.g. lists of strings), ast.literal_eval would be the safe
# equivalent — confirm the cell format and switch.
df["annotations"] = df.annotations.apply(eval)
# Rename the `esp` column to `clinical_note`, the key used for the dataset-item input.
df = df.rename(columns={'esp':'clinical_note'})
# NOTE(review): neither list is referenced by any other visible cell — confirm they are still needed.
input_keys = ['clinical_note']
output_keys = ['annotations'] 

In [14]:
# Directory holding the ontology resources read by this notebook.
RESOURCES_DIR="../../resources"
# JSON is UTF-8 by specification. Passing the encoding explicitly keeps the
# accented text in the file from being decoded with a platform-dependent
# default (e.g. cp1252 on Windows).
with open(os.path.join(RESOURCES_DIR, "hpo_es.json"), "r", encoding="utf-8") as fp:
    hpo = json.load(fp)
# The `id` of every entry in the file; process_output uses this list to drop unknown codes.
valid_ids = [x['id'] for x in hpo]

In [18]:
def process_output(output):
    """Turn raw annotation strings into the expected-output dict of a dataset item.

    Every entry is stripped of surrounding whitespace and kept only when it
    has the form ``HP:`` followed by exactly seven digits and is listed in
    the module-level ``valid_ids``. Input order and duplicates are preserved.

    Args:
        output: Iterable of strings.

    Returns:
        A dict with the single key ``"annotations"`` mapped to the list of
        codes that passed both checks.
    """
    hpo_pattern = re.compile(r"^HP:\d{7}$")
    kept = []
    for raw in output:
        code = raw.strip()
        # Cheap format check first, then membership in the known ids.
        if hpo_pattern.match(code) and code in valid_ids:
            kept.append(code)
    return {"annotations": kept}

In [92]:
# Upload one dataset item per row of `df`: the clinical note is the input and
# the cleaned annotations (see process_output) are the expected output.
# NOTE(review): re-running this cell presumably uploads the items again — confirm before re-executing.
for idx, row in df.iterrows(): 
    langfuse.create_dataset_item(
        dataset_name="RAGHPO",
        input={"clinical_note": row['clinical_note']},
        expected_output=process_output(row['annotations'])
    )

GSCESP

In [121]:
# Create the "GSCESP" dataset in Langfuse; its items are uploaded by a later cell.
langfuse.create_dataset(
    name="GSCESP",
    # optional description
    description="Dataset en español para la evaluación de herramientas de codificación fenotípica.",
    # optional metadata
    metadata={
        "author": "mdiazrio",
        "date": "2025-04-26",
        "type": "benchmark"
    }
)

Dataset(id='cm9xwlj4n015vpf06u8x2xkjn', name='GSCESP', description='Dataset en español para la evaluación de herramientas de codificación fenotípica.', metadata={'date': '2025-04-26', 'type': 'benchmark', 'author': 'mdiazrio'}, project_id='cm9vlvsif0006pf07xcychmbe', created_at=datetime.datetime(2025, 4, 26, 7, 33, 7, 223000, tzinfo=datetime.timezone.utc), updated_at=datetime.datetime(2025, 4, 26, 7, 33, 7, 223000, tzinfo=datetime.timezone.utc))

In [120]:
# Parallel lists: texts[i] is a document and annotations[i] the codes annotated for it.
texts = []
annotations = []
# os.listdir returns entries in arbitrary order; sorting makes the order of the
# resulting dataset items reproducible across runs and machines.
# NOTE(review): texts are read from "GCS+_ESP" while annotations come from
# "GSC+" — confirm the differing directory names are intentional.
for file in sorted(os.listdir("../../datasets/GCS+_ESP/Text")):
    # NOTE(review): the file is decoded with the platform default encoding —
    # confirm the corpus encoding and pass it explicitly.
    with open(os.path.join("../../datasets/GCS+_ESP/Text", file), "r") as fp:
        texts.append(fp.read())
    # The annotation file has the same name as the text file; it is tab-separated with no header row.
    annots = pd.read_csv(os.path.join("../../datasets/GSC+/Annotations", file), header=None, sep="\t")
    # Keep only the part of column 1 that precedes the first "|".
    annots[1] = annots[1].apply(lambda x: x.split("|")[0].strip())
    annotations.append(annots[1].to_list())

In [103]:
# Assemble the corpus frame and expose the text column as `clinical_note`.
gcs_esp = (
    pd.DataFrame({"esp":texts, "annotations":annotations})
    .rename(columns={"esp":"clinical_note"})
)

In [119]:
def process_output(output):
    """Turn raw GSC annotation codes into the expected-output dict of a dataset item.

    This definition replaces the earlier ``process_output`` in the notebook
    namespace. Each entry is stripped, has every ``_`` replaced by ``:``, and
    is kept only when it has the form ``HP:`` followed by exactly seven
    digits and is listed in the module-level ``valid_ids``. Duplicates are
    removed.

    Args:
        output: Iterable of strings.

    Returns:
        A dict with the single key ``"annotations"`` mapped to the list of
        unique valid codes, in order of first appearance.
    """
    hpo_pattern = re.compile(r"^HP:\d{7}$")
    normalised = (s.strip().replace('_', ':') for s in output)
    kept = [code for code in normalised if hpo_pattern.match(code) and code in valid_ids]
    # dict.fromkeys de-duplicates while keeping first-seen order; list(set(...))
    # gave an order that could change from one interpreter run to the next.
    return {"annotations": list(dict.fromkeys(kept))}

In [122]:
# Upload one dataset item per row of `gcs_esp`: the clinical note is the input and
# the cleaned annotations (see process_output) are the expected output.
# NOTE(review): re-running this cell presumably uploads the items again — confirm before re-executing.
for idx, row in gcs_esp.iterrows(): 
    langfuse.create_dataset_item(
        dataset_name="GSCESP",
        input={"clinical_note": row['clinical_note']},
        expected_output=process_output(row['annotations'])
    )

Definición de las métricas

In [69]:
def clean_final_answer(outputs):
    """Extract the predicted codes from a chain response.

    Args:
        outputs: Mapping with a ``"final answer"`` entry.

    Returns:
        A list with the stripped ``hpo_code`` attribute of every element of
        ``outputs["final answer"]``. If the entry cannot be processed that
        way (it is not iterable, or an element has no usable ``hpo_code``),
        the entry is returned unchanged.

    Raises:
        KeyError: if ``outputs`` has no ``"final answer"`` key.
    """
    answer = outputs["final answer"]
    try:
        return [code.hpo_code.strip() for code in answer]
    except (AttributeError, TypeError):
        # Only the two "wrong shape" failures trigger the fallback; a bare
        # `except:` would also swallow KeyboardInterrupt and SystemExit.
        return answer

In [39]:
# Directory containing the files phenopy needs.
phenopy_data_directory = "../../resources/"

# files used in building the annotated HPO network
obo_file = os.path.join(phenopy_data_directory, 'hp.obo')
disease_to_phenotype_file = os.path.join(phenopy_data_directory, 'phenotype.hpoa')

# Build the annotated network. alt2prim and disease_records are returned as
# well but are not used by any other visible cell.
hpo_network, alt2prim, disease_records = \
    generate_annotated_hpo_network(obo_file,
                                   disease_to_phenotype_file)

# Scorer used by semantic_similarity.
scorer = Scorer(hpo_network)

In [42]:
# Evaluator signature: (chain outputs, reference outputs) -> list of score dicts.
def traditional_metrics(outputs: dict, reference_outputs: dict) -> list[dict]:
    """Compute exact-match precision, recall and F1 of the predicted codes.

    Args:
        outputs: Chain response, passed to ``clean_final_answer``.
        reference_outputs: Mapping whose ``"annotations"`` entry holds the
            reference codes.

    Returns:
        Three ``{"key", "score"}`` dicts, in the order precision, recall, f1.
        Precision is 0 when nothing was predicted, recall is 0 when there are
        no reference codes, and f1 is 0 when both are 0. Only f1 is rounded
        (to two decimals).
    """
    predicted_terms = clean_final_answer(outputs)
    real_terms = reference_outputs["annotations"]

    if len(predicted_terms) == 0:
        precision = 0
    else:
        correct_predictions = sum(1 for term in predicted_terms if term in real_terms)
        precision = correct_predictions / len(predicted_terms)

    if len(real_terms) == 0:
        recall = 0
    else:
        recovered_references = sum(1 for term in real_terms if term in predicted_terms)
        recall = recovered_references / len(real_terms)

    denominator = precision + recall
    if denominator == 0:
        f1 = 0
    else:
        f1 = round(2 * (precision * recall) / denominator, 2)

    results = []
    for key, value in (("precision", precision), ("recall", recall), ("f1", f1)):
        results.append({"key": key, "score": value})
    return results

In [71]:
def semantic_similarity(outputs: dict, reference_outputs: dict) -> list[dict]:
    """Score the predicted codes against the reference codes with phenopy.

    Args:
        outputs: Chain response, passed to ``clean_final_answer``.
        reference_outputs: Mapping whose ``"annotations"`` entry holds the
            reference codes.

    Returns:
        A one-element list ``[{"key": "semantic similarity", "score": s}]``
        where ``s`` is the value returned by the module-level
        ``scorer.score_term_sets_basic``, or ``-1`` if that call raised.
    """
    predicted_terms = clean_final_answer(outputs)
    real_terms = reference_outputs["annotations"]
    try:
        score = scorer.score_term_sets_basic(predicted_terms, real_terms)
    except Exception:
        # -1 marks "could not be scored". Catching Exception rather than
        # using a bare `except:` keeps KeyboardInterrupt/SystemExit working.
        score = -1
    return [{"key": "semantic similarity", "score": score}]

In [47]:
def jaccard_similarity(outputs: dict, reference_outputs: dict) -> list[dict]:
    """Compute the Jaccard similarity between predicted and reference codes.

    Args:
        outputs: Chain response, passed to ``clean_final_answer``.
        reference_outputs: Mapping whose ``"annotations"`` entry holds the
            reference codes.

    Returns:
        A one-element list ``[{"key": "jaccard_similarity", "score": s}]``
        with ``s = |intersection| / |union|`` of the two sets. When both sets
        are empty the similarity is defined as 1.0.
    """
    predicted_terms = set(clean_final_answer(outputs))
    real_terms = set(reference_outputs["annotations"])
    union = predicted_terms | real_terms
    if union:
        score = len(predicted_terms & real_terms) / len(union)
    else:
        # Both sets empty: report perfect agreement, wrapped in the same
        # list-of-dicts shape as every other result so callers can iterate it.
        score = 1.0
    return [{"key": "jaccard_similarity", "score": score}]

In [45]:
def retrieve_accuracy(outputs: dict, reference_outputs: dict) -> list[dict]:
    """Compute precision and recall of the retrieved candidate codes.

    Args:
        outputs: Mapping whose ``"docs"`` entry is an iterable of iterables
            of candidate codes; they are merged into a single set.
        reference_outputs: Mapping whose ``"annotations"`` entry holds the
            reference codes.

    Returns:
        ``[{"key": "r_precision", ...}, {"key": "r_recall", ...}]``.
        Precision is 0 when no candidates were retrieved and recall is 0 when
        there are no reference codes.
    """
    predicted_candidates = set().union(*outputs["docs"])
    real_terms = set(reference_outputs["annotations"])
    hits = len(real_terms & predicted_candidates)
    # Guard both denominators: an empty reference or candidate set would
    # otherwise raise ZeroDivisionError.
    recall = hits / len(real_terms) if real_terms else 0
    precision = hits / len(predicted_candidates) if predicted_candidates else 0

    return [{"key": "r_precision", "score": precision},
            {"key": "r_recall", "score": recall}]

In [78]:
# Re-import the rawgptchain module and rebind the chain object it exposes,
# so the name below refers to the reloaded module's object.
importlib.reload(rgc)
rawgptchain = rgc.rawgptchain

In [93]:
dataset = langfuse.get_dataset("RAGHPO")
# Each evaluator takes (chain response, expected output) and returns a list of {"key", "score"} dicts.
evaluators = [traditional_metrics, semantic_similarity, jaccard_similarity] 
# One timestamped run name shared by every item of this experiment.
run_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# (item id, exception) for every item that could not be fully processed.
failed_items = []
for item in dataset.items:
    try:
        handler = item.get_langchain_handler(run_name=run_name)
        response = rawgptchain.with_config({ "callbacks": [handler]}).invoke(item.input)
        for evaluator in evaluators:
            scores = evaluator(response, item.expected_output)
            for score in scores:
                langfuse.score(trace_id=handler.get_trace_id(), name=score["key"], value=score["score"])
    except Exception as exc:
        # Still best effort (one bad item must not abort the run), but the
        # failure is recorded and reported instead of vanishing. Catching
        # Exception rather than using a bare `except:` also lets Ctrl-C
        # interrupt the loop.
        item_id = getattr(item, "id", None)
        failed_items.append((item_id, exc))
        print(f"Skipping dataset item {item_id!r}: {exc!r}")
        continue
# Flush the langfuse client to ensure all data is sent to the server at the end of the experiment run
langfuse.flush()

if failed_items:
    print(f"{len(failed_items)} dataset item(s) failed; details are in `failed_items`.")

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


In [74]:
from langchain_chroma import Chroma
from langchain_voyageai import VoyageAIEmbeddings
# Embedding model passed to the vector store below.
embeddings_model = VoyageAIEmbeddings(model="voyage-3")
# Open the persisted Chroma collection "hpo_ontology_esp" from ../../chroma_db/Voyage3.
vectordb = Chroma(persist_directory="../../chroma_db/Voyage3", embedding_function=embeddings_model, 
                  collection_name="hpo_ontology_esp")
# Retriever configured with k=10.
# NOTE(review): not used by any other visible cell — confirm it is still needed.
retriever = vectordb.as_retriever(search_kwargs={"k": 10})

In [75]:
# NOTE(review): `ids` is not defined in any visible cell, so this line appears
# to depend on leftover kernel state — confirm where it comes from or define it here.
vectordb.get_by_ids(ids)

[Document(id='HP:0001539', metadata={'hpo_id': 'HP:0001539', 'lineage': 'HP:0004299->HP:0003549->HP:0033127->HP:0010866->HP:0000118->HP:0004298->HP:0025031->HP:0100790'}, page_content='Hernia umbilical. Cierre incompleto anterior de la línea media de la pared abdominal en el que hay herniación de las vísceras abdominales hacia la base de la cuerda abdominal.'),
 Document(id='HP:0001973', metadata={'hpo_id': 'HP:0001973', 'lineage': 'HP:0002960->HP:0011875->HP:0002715->HP:0001873->HP:0001872->HP:0000118->HP:0011873->HP:0010978->HP:0001871'}, page_content='Trombocitopenia autoinmune. La presencia de trombocitopenia en combinación con la detección de anticuerpos antiplaquetarios.'),
 Document(id='HP:0009738', metadata={'hpo_id': 'HP:0009738', 'lineage': 'HP:0000377->HP:0000356->HP:0031703->HP:0000598->HP:0000118'}, page_content='Anomalía del antihelix. Una anomalía de la antihélice.'),
 Document(id='HP:0010184', metadata={'hpo_id': 'HP:0010184', 'lineage': 'HP:0010161->HP:0001780->HP:0011

In [21]:
# Load the TFM test set.
test_df = pd.read_csv("../../datasets/TFM_test.csv")
# NOTE(review): eval() executes whatever code the CSV contains. If the cells are
# plain literals, ast.literal_eval would be the safe equivalent — confirm and switch.
test_df.annotations = test_df.annotations.apply(eval)
# Take the first row as a worked example.
clinical_note = test_df.texts.iloc[0]
hpo_codes = test_df.annotations.iloc[0]
# Map the id of each document returned for that row's codes to its page_content.
codigos_reales = {doc.id:doc.page_content for doc in  vectordb.get_by_ids(set(hpo_codes))}

In [22]:
# Display the id -> page_content mapping built in the previous cell.
codigos_reales

{'HP:0001513': 'Obesidad. Acumulación de un exceso considerable de grasa corporal.',
 'HP:0004394': 'Pólipos gástricos múltiples.',
 'HP:0005227': 'Poliposis colónica adenomatosa. Presencia de múltiples pólipos adenomatosos en el colon.',
 'HP:0012183': 'Poliposis colónica hiperplásica. Presencia de múltiples pólipos hiperplásicos en el colon. Los pólipos hiperplásicos suelen tener un tamaño de unos 5 mm y muestran una proliferación hiperplásica de la mucosa.',
 'HP:0025501': 'Obesidad clase III. Obesidad con un índice de masa corporal de 40 kg por metro cuadrado o superior.',
 'HP:0031500': 'Masa abdominal. Agrandamiento o hinchazón anormal del abdomen.',
 'HP:0033769': 'Poliposis de las glándulas fúndicas. Múltiples pólipos en la mucosa secretora de ácido del cuerpo gástrico y el fundus. Los pólipos de la glándula fúndica (PGF) suelen tener un tamaño de 1 a 5 mm, aunque se han encontrado pólipos de mayor tamaño. Suelen ser sésiles, brillantes, translúcidos, de color pálido a rosado (