# Creating the evaluation Dataset with Langsmith

In [None]:
import os
import re 
import json
import importlib
import pandas as pd
from dotenv import load_dotenv
from langsmith import Client, evaluate
from phenopy.score import Scorer
from langsmith.schemas import Run, Example
from phenopy.build_hpo import generate_annotated_hpo_network
import sys
sys.path.append('../utils')
import customchain as cc
custom_chain = cc.custom_chain

In [26]:
importlib.reload(cc)
custom_chain = cc.custom_chain

In [2]:
load_dotenv(override=True)

True

Creación de los datasets de RAG-HPO y GSC

In [8]:
client = Client()

In [None]:
df = pd.read_csv('../../datasets/RAG-HPO/Test_Cases.csv')
df["annotations"] = df.annotations.apply(eval)
df = df.rename(columns={'esp':'clinical_note'})
input_keys = ['clinical_note']
output_keys = ['annotations'] 

dataset = client.upload_dataframe(
    df=df,
    input_keys=input_keys,
    output_keys=output_keys,
    name="RAG-HPO eval dataset",
    description="Dataset en español para la evaluación de herramientas de codificación fenotípica.",
    data_type="kv" # The default
)

In [36]:
RESOURCES_DIR="../../resources"
with open(os.path.join(RESOURCES_DIR, "hpo_es.json"), "r") as fp:
    hpo = json.load(fp)
valid_ids = [x['id'] for x in hpo]

In [None]:
def process_output(output):
    hpo_list = output["annotations"]
    hpo_list = [s.strip() for s in hpo_list]
    hpo_list = [j for j in hpo_list if re.compile(r"^HP:\d{7}$").match(j)]
    hpo_list = [i for i in hpo_list if i in valid_ids]
    return {"annotations":hpo_list}

In [39]:
examples = client.list_examples(dataset_name="RAG-HPO eval dataset")
ids = []
metadata = []
inputs = []
outputs = []
for example in examples:
    ids.append(example.id)
    inputs.append(example.inputs)
    outputs.append(process_output(example.outputs))

In [42]:
client.update_examples(
    dataset_name="RAG-HPO eval dataset",
    example_ids = ids,
    inputs = inputs, 
    outputs = outputs
    )

{'message': '116 examples updated',
 'example_ids': ['1a31bc86-a17e-4418-b3b4-6579196276e1',
  'd13a3ff3-2648-40b5-abe7-bf6d293b2cc8',
  'f19e2e38-e276-4f58-8508-359a659b5627',
  'af36f449-a848-4b5f-b9dc-4bae9629d2b5',
  'afd089c8-5bcb-4f8f-a14b-0d4eea654e0f',
  'bee473dd-6c83-4252-a51d-93679494c278',
  'fdcb4fa3-05dc-4bf6-8bbf-7b6d66465cf4',
  '2e6b74d0-35e6-45d6-83dd-b293615d85f1',
  '7a4edacb-b36a-4d93-9ef5-8703c8d166a0',
  '7faf2fb6-33ab-4088-adb6-605c0b1b3b4c',
  'a0975ca7-2f93-4d88-abfb-66b79014f06b',
  'a5270446-3024-4136-949b-dc5413854718',
  'a58f8dcc-b7ff-43e5-b38f-ee4a11c3cbe9',
  '512b9043-57c0-43ee-ba35-77eeec30e534',
  '7a0f7dec-d7cd-4c0c-8933-ced77b7f4cf8',
  '7f8644d2-54cb-4cfd-8462-f9f96dd09559',
  '78f7deb4-368a-49af-af0a-ee97b7113e71',
  '9ab0333b-c04d-4a19-a846-376864f069fd',
  'd0ce413e-db89-4eb6-b6bb-831abd03e62a',
  '256756a1-a92a-4946-87bb-ffb8333cf686',
  '36dcb0ab-aed4-4608-bd68-f63ee73edbd4',
  '54c6fd9d-fe8a-440b-b02e-68742fb13b7f',
  '3ff386a6-24c0-4c26-9b5

In [9]:
texts = []
annotations = []
for file in os.listdir("../../datasets/GCS+_ESP/Text"):
    with open(os.path.join("../../datasets/GCS+_ESP/Text", file), "r") as fp:
        texts.append(fp.read())
    annots = pd.read_csv(os.path.join("../../datasets/GSC+/Annotations", file), header=None, sep="\t")
    annots[1] = annots[1].apply(lambda x: x.split("|")[0].strip())
    annotations.append(annots[1].to_list())

In [11]:
gcs_esp = pd.DataFrame({"esp":texts, "annotations":annotations})

In [13]:
input_keys = ['esp']
output_keys = ['annotations'] 

dataset = client.upload_dataframe(
    df=gcs_esp,
    input_keys=input_keys,
    output_keys=output_keys,
    name="GSC eval dataset",
    description="Dataset en español para la evaluación de herramientas de codificación fenotípica.",
    data_type="kv" # The default
)

Definición de las métricas

In [39]:
def clean_final_answer(outputs):
    return [code.hpo_code.strip() for code in outputs["final answer"]]

In [3]:
phenopy_data_directory = "../../resources/"

# files used in building the annotated HPO network
obo_file = os.path.join(phenopy_data_directory, 'hp.obo')
disease_to_phenotype_file = os.path.join(phenopy_data_directory, 'phenotype.hpoa')

hpo_network, alt2prim, disease_records = \
    generate_annotated_hpo_network(obo_file,
                                   disease_to_phenotype_file)

scorer = Scorer(hpo_network)

In [44]:
# We can still pass in Run and Example objects if we'd like
def traditional_metrics(outputs: dict, reference_outputs: dict) -> list[dict]:
    """Check precision, recall and f1."""
    predicted_terms = clean_final_answer(outputs)
    real_terms = eval(reference_outputs["annotations"])
    precision = 0 if len(predicted_terms) == 0 else sum([int(term in real_terms) for term in predicted_terms]) / len(predicted_terms)
    recall = 0 if len(real_terms) == 0 else sum([int(term in predicted_terms) for term in real_terms]) / len(real_terms)
    f1 = 0 if (precision + recall) == 0 else round(2 * (precision * recall) / (precision + recall),2)

    return [
        {"key": "precision", "score": precision},
        {"key": "recall", "score": recall},
        {"key": "f1", "score": f1},
    ]

In [6]:
def semantic_similarity(outputs: dict, reference_outputs: dict)->float:
    """Check semantic similarity using phenopy."""
    predicted_terms = clean_final_answer(outputs)
    real_terms = eval(reference_outputs["annotations"])

    return {"key": "semantic similarity", "score": scorer.score_term_sets_basic(predicted_terms, real_terms)}

In [20]:
scorer.score_term_sets_basic(["HP:0033349"], ["HP:0001317"])

0.0

In [7]:
def jaccard_similarity(outputs: dict, reference_outputs: dict)->float:
    """Check Jaccard similarity between two sets."""
    predicted_terms = set(clean_final_answer(outputs))
    real_terms = set(eval(reference_outputs["annotations"]))    
    intersection = predicted_terms.intersection(real_terms)
    union = predicted_terms.union(real_terms)
    if not union:
        return 1.0  # define similarity as 1.0 when both are empty
    return len(intersection) / len(union)

In [None]:
def retrieve_accuracy(outputs: dict, reference_outputs: dict)->float:
    """Check retriever accuracy and recall."""
    predicted_candidates = set().union(*outputs["docs"])
    real_terms = set(eval(reference_outputs["annotations"]))  
    recall = len(real_terms & predicted_candidates) / len(real_terms)           # = 2/2 = 1.0
    precision = len(real_terms & predicted_candidates) / len(predicted_candidates)   

    return [{"key": "r_precision", "score": precision}, 
            {"key": "r_recall", "score": recall}]

In [None]:
results = evaluate(
    custom_chain,
    data="RAG-HPO eval dataset",
    evaluators=[traditional_metrics, semantic_similarity, jaccard_similarity, retrieve_accuracy]
)

View the evaluation results for experiment: 'extraneous-stretch-87' at:
https://smith.langchain.com/o/69013edf-2d4b-41cf-a1be-5c1e6977330d/datasets/b39bf604-8d88-4bd8-8a50-e98e9b5f7138/compare?selectedSessions=52de0f30-c683-4232-9b39-29b58293818e




0it [00:00, ?it/s]Error running evaluator <DynamicRunEvaluator semantic_similarity> on run 22202133-14b7-4acf-8d84-f8a65cb27423: KeyError('HP:6000040')
Traceback (most recent call last):
  File "/Users/malenadiazrio/Documents/UOC/TFM/TFM_2025_1/.venv/lib/python3.11/site-packages/langsmith/evaluation/_runner.py", line 1634, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(  # type: ignore[call-arg]
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/malenadiazrio/Documents/UOC/TFM/TFM_2025_1/.venv/lib/python3.11/site-packages/langsmith/evaluation/evaluator.py", line 346, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "/Users/malenadiazrio/Documents/UOC/TFM/TFM_2025_1/.venv/lib/python3.11/site-packages/langsmith/evaluation/evaluator.py", line 744, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/d6/gvk9_zmj13704s2_fj12rw680000gn/T/ipykernel_66703/1193214177.py"

In [20]:
from langchain_chroma import Chroma
from langchain_voyageai import VoyageAIEmbeddings
embeddings_model = VoyageAIEmbeddings(model="voyage-3")
vectordb = Chroma(persist_directory="../../chroma_db/Voyage3", embedding_function=embeddings_model, 
                  collection_name="hpo_ontology_esp")
retriever = vectordb.as_retriever(search_kwargs={"k": 10})

In [21]:
test_df = pd.read_csv("../../datasets/TFM_test.csv")
test_df.annotations = test_df.annotations.apply(eval)
clinical_note = test_df.texts.iloc[0]
hpo_codes = test_df.annotations.iloc[0]
codigos_reales = {doc.id:doc.page_content for doc in  vectordb.get_by_ids(set(hpo_codes))}

In [22]:
codigos_reales

{'HP:0001513': 'Obesidad. Acumulación de un exceso considerable de grasa corporal.',
 'HP:0004394': 'Pólipos gástricos múltiples.',
 'HP:0005227': 'Poliposis colónica adenomatosa. Presencia de múltiples pólipos adenomatosos en el colon.',
 'HP:0012183': 'Poliposis colónica hiperplásica. Presencia de múltiples pólipos hiperplásicos en el colon. Los pólipos hiperplásicos suelen tener un tamaño de unos 5 mm y muestran una proliferación hiperplásica de la mucosa.',
 'HP:0025501': 'Obesidad clase III. Obesidad con un índice de masa corporal de 40 kg por metro cuadrado o superior.',
 'HP:0031500': 'Masa abdominal. Agrandamiento o hinchazón anormal del abdomen.',
 'HP:0033769': 'Poliposis de las glándulas fúndicas. Múltiples pólipos en la mucosa secretora de ácido del cuerpo gástrico y el fundus. Los pólipos de la glándula fúndica (PGF) suelen tener un tamaño de 1 a 5 mm, aunque se han encontrado pólipos de mayor tamaño. Suelen ser sésiles, brillantes, translúcidos, de color pálido a rosado (