# Creating the evaluation Dataset with Langfuse

In [1]:
import os
import re 
from datetime import datetime
from datetime import datetime
import json
import importlib
import pandas as pd
from dotenv import load_dotenv
from langsmith import Client, evaluate
from phenopy.score import Scorer
from langsmith.schemas import Run, Example
from phenopy.build_hpo import generate_annotated_hpo_network
from concurrent.futures import ThreadPoolExecutor

load_dotenv()
import sys

sys.path.append(os.path.join(os.environ['PROJECT_DIR'], 'src'))

import utils.customchain as cc
custom_chain = cc.custom_chain

import utils.rawgptchain as rgc
rawgptchain = rgc.rawgptchain

2025-05-25 17:53:15,623 - phenopy - INFO - checking if config file exists: /Users/malenadiazrio/.phenopy
2025-05-25 17:53:15,624 - phenopy - INFO - phenopy 0.6.0
2025-05-25 17:53:15,624 - phenopy - INFO - Using configuration file: /Users/malenadiazrio/.phenopy/phenopy.ini
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
importlib.reload(cc)
custom_chain = cc.custom_chain

In [3]:
load_dotenv(override=True)

True

Creación de los datasets de RAG-HPO y GSC

In [4]:
from langfuse import Langfuse
langfuse = Langfuse()

In [5]:
langfuse.create_dataset(
    name="RAGHPO",
    # optional description
    description="Dataset en español para la evaluación de herramientas de codificación fenotípica.",
    # optional metadata
    metadata={
        "author": "mdiazrio",
        "date": "2025-04-25",
        "type": "benchmark"
    }
)

Dataset(id='cm9x6ivv80096pf0604is8yzj', name='RAGHPO', description='Dataset en español para la evaluación de herramientas de codificación fenotípica.', metadata={'date': '2025-04-25', 'type': 'benchmark', 'author': 'mdiazrio'}, project_id='cm9vlvsif0006pf07xcychmbe', created_at=datetime.datetime(2025, 4, 25, 19, 23, 13, 748000, tzinfo=datetime.timezone.utc), updated_at=datetime.datetime(2025, 5, 25, 13, 3, 28, 726000, tzinfo=datetime.timezone.utc))

In [13]:
df = pd.read_csv('../../datasets/RAG-HPO/Test_Cases.csv')
df["annotations"] = df.annotations.apply(eval)
df = df.rename(columns={'esp':'clinical_note'})
input_keys = ['clinical_note']
output_keys = ['annotations'] 

In [14]:
RESOURCES_DIR="../../resources"
with open(os.path.join(RESOURCES_DIR, "hpo_es.json"), "r") as fp:
    hpo = json.load(fp)
valid_ids = [x['id'] for x in hpo]

In [18]:
def process_output(output):
    hpo_list = [s.strip() for s in output]
    hpo_list = [j for j in hpo_list if re.compile(r"^HP:\d{7}$").match(j)]
    hpo_list = [i for i in hpo_list if i in valid_ids]
    return {"annotations":hpo_list}

In [92]:
for idx, row in df.iterrows(): 
    langfuse.create_dataset_item(
        dataset_name="RAGHPO",
        input={"clinical_note": row['clinical_note']},
        expected_output=process_output(row['annotations'])
    )

GSCESP

In [121]:
langfuse.create_dataset(
    name="GSCESP",
    # optional description
    description="Dataset en español para la evaluación de herramientas de codificación fenotípica.",
    # optional metadata
    metadata={
        "author": "mdiazrio",
        "date": "2025-04-26",
        "type": "benchmark"
    }
)

Dataset(id='cm9xwlj4n015vpf06u8x2xkjn', name='GSCESP', description='Dataset en español para la evaluación de herramientas de codificación fenotípica.', metadata={'date': '2025-04-26', 'type': 'benchmark', 'author': 'mdiazrio'}, project_id='cm9vlvsif0006pf07xcychmbe', created_at=datetime.datetime(2025, 4, 26, 7, 33, 7, 223000, tzinfo=datetime.timezone.utc), updated_at=datetime.datetime(2025, 4, 26, 7, 33, 7, 223000, tzinfo=datetime.timezone.utc))

In [4]:
DATASET_DIR=os.path.join(os.environ['PROJECT_DIR'],"datasets/GSC+")
texts = []
annotations = []
for file in os.listdir(DATASET_DIR + "_ESP/Text"):
    with open(os.path.join(DATASET_DIR + "_ESP/Text", file), "r") as fp:
        texts.append(fp.read())
    annots = pd.read_csv(os.path.join(DATASET_DIR + "/Annotations", file), header=None, sep="\t")
    annots[1] = annots[1].apply(lambda x: x.split("|")[0].strip())
    annotations.append(annots[1].to_list())

In [5]:
gcs_esp = pd.DataFrame({"esp":texts, "annotations":annotations})
gcs_esp.rename(columns={"esp":"clinical_note"}, inplace=True)

In [9]:
gcs_esp.annotations.apply(lambda x: len(x)).mean()

12.162280701754385

In [1]:
import pickle as pkl
with open("/Users/malenadiazrio/Documents/UOC/TFM/TFM_2025_1/resources/Voyage Embeddings/embeddings_w_synonyms.pkl", "rb") as fp:
    embeddings = pkl.load(fp)

In [4]:
len(embeddings[0])

1024

In [5]:
def process_output(output):
    hpo_list = [s.strip() for s in output]
    hpo_list = [s.replace('_', ':') for s in hpo_list]
    hpo_list = [j for j in hpo_list if re.compile(r"^HP:\d{7}$").match(j)]
    hpo_list = [i for i in hpo_list if i in valid_ids]
    hpo_list = list(set(hpo_list))
    return {"annotations":hpo_list}

In [None]:
for idx, row in gcs_esp[0:5].iterrows(): 
    langfuse.create_dataset_item(
        dataset_name="Pruebas",
        input={"clinical_note": row['clinical_note']},
        expected_output=process_output(row['annotations'])
    )

Definición de las métricas

In [6]:
import re

def extract_hpo_code(text):
    # Check for full HPO code
    match = re.search(r'HP:\d{7}', text)
    if match:
        return match.group()
    
    # Check for 7-digit number
    match = re.search(r'\d{7}', text)
    if match:
        return f"HP:{match.group()}"
    
    # Nothing found
    return None

In [7]:
def clean_final_answer(outputs):
    try:
        clean_codes = [code.hpo_code.strip() for code in outputs["final_answer"] if getattr(code, "hpo_code", None) is not None]
        clean_codes = [extract_hpo_code(code) for code in clean_codes]
        clean_codes = [code for code in clean_codes if code is not None]
        return set(clean_codes)
    except:
        return outputs["final_answer"]

In [8]:
phenopy_data_directory = os.path.join(os.environ["PROJECT_DIR"],"./resources/")

# files used in building the annotated HPO network
obo_file = os.path.join(phenopy_data_directory, 'hp.obo')
disease_to_phenotype_file = os.path.join(phenopy_data_directory, 'phenotype.hpoa')

hpo_network, alt2prim, disease_records = \
    generate_annotated_hpo_network(obo_file,
                                   disease_to_phenotype_file)

scorer = Scorer(hpo_network)

In [9]:
# We can still pass in Run and Example objects if we'd like
def traditional_metrics(outputs: dict, reference_outputs: dict) -> list[dict]:
    """Check precision, recall and f1."""
    predicted_terms = clean_final_answer(outputs)
    real_terms = reference_outputs["annotations"]
    precision = 0 if len(predicted_terms) == 0 else sum([int(term in real_terms) for term in predicted_terms]) / len(predicted_terms)
    recall = 0 if len(real_terms) == 0 else sum([int(term in predicted_terms) for term in real_terms]) / len(real_terms)
    f1 = 0 if (precision + recall) == 0 else round(2 * (precision * recall) / (precision + recall),2)

    return [
        {"key": "precision", "score": precision},
        {"key": "recall", "score": recall},
        {"key": "f1", "score": f1},
    ]

In [10]:
def semantic_similarity(outputs: dict, reference_outputs: dict)->float:
    """Check semantic similarity using phenopy."""
    predicted_terms = clean_final_answer(outputs)
    real_terms = reference_outputs["annotations"]
    try:
        score = scorer.score_term_sets_basic(predicted_terms, real_terms)
    except:
        score = -1
    return [{"key": "semantic similarity", "score":score }]

In [11]:
def jaccard_similarity(outputs: dict, reference_outputs: dict)->float:
    """Check Jaccard similarity between two sets."""
    predicted_terms = set(clean_final_answer(outputs))
    real_terms = set(reference_outputs["annotations"])   
    intersection = predicted_terms.intersection(real_terms)
    union = predicted_terms.union(real_terms)
    if not union:
        return 1.0  # define similarity as 1.0 when both are empty
    return [{"key": "jaccard_similarity", "score": len(intersection) / len(union)}]

In [12]:
def retrieve_accuracy(outputs: dict, reference_outputs: dict)->float:
    """Check retriever accuracy and recall."""
    predicted_candidates = set().union(*outputs["docs"])
    real_terms = set(reference_outputs["annotations"])
    recall = len(real_terms & predicted_candidates) / len(real_terms)           # = 2/2 = 1.0
    precision = len(real_terms & predicted_candidates) / len(predicted_candidates)   

    return [{"key": "r_precision", "score": precision}, 
            {"key": "r_recall", "score": recall}]

In [13]:
importlib.reload(cc)
custom_chain = cc.custom_chain

In [14]:
MAX_WORKERS = 5
dataset = langfuse.get_dataset("GSCESP")
evaluators = [traditional_metrics, semantic_similarity, jaccard_similarity, retrieve_accuracy] 
run_name = "top10"

def process_item(item):
    try:
        handler = item.get_langchain_handler(run_name=run_name)
        response = custom_chain.with_config({ "callbacks": [handler]}).invoke(item.input)
        for evaluator in evaluators:
            scores = evaluator(response, item.expected_output)
            for score in scores:
                langfuse.score(trace_id=handler.get_trace_id(), name=score["key"], value=score["score"])
    except Exception as e:
        print(f"Error processing item: {e}")

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    executor.map(process_item, dataset.items)

# for item in dataset.items[0:20]:
#     process_item(item)


# Ensure all data is sent
langfuse.flush()

Error processing item: status_code: 500, body: {'message': 'Internal Server Error', 'error': 'An unknown error occurred'}
Error processing item: status_code: 500, body: {'message': 'Internal Server Error', 'error': 'An unknown error occurred'}
Error processing item: status_code: 500, body: {'message': 'Internal Server Error', 'error': 'An unknown error occurred'}
Error processing item: status_code: 500, body: {'message': 'Internal Server Error', 'error': 'An unknown error occurred'}


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Error processing item: division by zero


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


In [17]:
langfuse.flush()

Code to recompute scores

In [66]:
dataset = langfuse.get_dataset("RAGHPO")
expected_responses = {}
for item in dataset.items:
    expected_responses[item.id] = item.expected_output

In [67]:
def clean_final_answer(outputs):
    # try:
    clean_codes = [code["hpo_code"].strip() for code in outputs["final_answer"] if  code["hpo_code"] is not None]
    clean_codes = [extract_hpo_code(code) for code in clean_codes]
    clean_codes = [code for code in clean_codes if code is not None]
    return set(clean_codes)
    # except:
    #     return outputs["final_answer"]

In [68]:
def langfuse_run_ids(dataset_name, dataset_run_name):
    dataset_run = langfuse.get_dataset_run(
    dataset_name=dataset_name, dataset_run_name=dataset_run_name
    )
    trace_ids = [run_item.trace_id for run_item in dataset_run.dataset_run_items]
    return trace_ids

In [None]:
trace_ids = langfuse_run_ids("RAGHPO", "top10") #"2025-04-27_11-43-07"

In [70]:
def recompute_score(trace_id):
    trace = langfuse.fetch_trace(trace_id).data
    evaluators = [traditional_metrics, semantic_similarity, jaccard_similarity, retrieve_accuracy] 
    scores = []
    for evaluator in evaluators:
        scores += evaluator(trace.output,expected_responses[trace.metadata['dataset_item_id']])

    scores = {score['key']:score['score'] for score in scores}
    for score in trace.scores:
        if score.name in scores:
            langfuse.score(
                id = score.id,
                name = score.name,
                value= scores[score.name],
            )

In [72]:
MAX_WORKERS = 5
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    executor.map(recompute_score, trace_ids)