# Creating the evaluation datasets with LangSmith

In [None]:
import os
import re 
import json
import importlib
import pandas as pd
from dotenv import load_dotenv
from langsmith import Client, evaluate
from phenopy.score import Scorer
from langsmith.schemas import Run, Example
from phenopy.build_hpo import generate_annotated_hpo_network
import sys
sys.path.append('../utils')
import customchain as cc
custom_chain = cc.custom_chain

In [26]:
# Re-execute the local customchain module and rebind the chain object exposed
# by it, so this notebook uses the reloaded definition.
custom_chain = importlib.reload(cc).custom_chain

In [2]:
# Load environment variables from a .env file. override=True is passed so that
# values from the file replace variables already set in the process.
load_dotenv(override=True)

True

Creación de los datasets de RAG-HPO y GSC

In [8]:
# LangSmith client used by the cells below to upload, list and update datasets.
# NOTE(review): presumably reads its credentials from the environment — confirm.
client = Client()

In [None]:
import ast

# Read the RAG-HPO test cases and upload them to LangSmith as a key-value
# dataset with one input column and one output column.
df = pd.read_csv('../../datasets/RAG-HPO/Test_Cases.csv')

# The annotations column is expected to hold the text of a Python literal
# (e.g. a list of HPO codes). ast.literal_eval is used instead of eval so that
# the CSV content is only parsed, never executed as code.
df["annotations"] = df["annotations"].apply(ast.literal_eval)

# Expose the 'esp' column under the name 'clinical_note' in the dataset.
df = df.rename(columns={'esp': 'clinical_note'})
input_keys = ['clinical_note']
output_keys = ['annotations']

dataset = client.upload_dataframe(
    df=df,
    input_keys=input_keys,
    output_keys=output_keys,
    name="RAG-HPO eval dataset",
    description="Dataset en español para la evaluación de herramientas de codificación fenotípica.",
    data_type="kv",  # The default
)

In [36]:
RESOURCES_DIR = "../../resources"

# Load the HPO term list and keep the 'id' of every entry; process_output uses
# valid_ids to discard codes that are not in this file.
# The encoding is given explicitly so decoding does not depend on the platform
# default (JSON text is UTF-8 by specification).
with open(os.path.join(RESOURCES_DIR, "hpo_es.json"), "r", encoding="utf-8") as fp:
    hpo = json.load(fp)
valid_ids = [entry['id'] for entry in hpo]

In [None]:
def process_output(output):
    """Return a cleaned copy of an example's annotations.

    Each entry of ``output["annotations"]`` is stripped of surrounding
    whitespace and kept only if it has the form ``HP:`` followed by exactly
    seven digits and appears in the module-level ``valid_ids``. The original
    order (and any duplicates) is preserved.

    Args:
        output: Mapping with an ``"annotations"`` key holding an iterable of
            strings.

    Returns:
        A dict ``{"annotations": [...]}`` with the surviving codes.
    """
    # Compile the pattern and build the lookup set once per call instead of
    # once per element; set membership is O(1) versus a scan of the list.
    hpo_pattern = re.compile(r"^HP:\d{7}$")
    known_ids = set(valid_ids)

    stripped = [entry.strip() for entry in output["annotations"]]
    hpo_list = [
        code for code in stripped
        if hpo_pattern.match(code) and code in known_ids
    ]
    return {"annotations": hpo_list}

In [39]:
# Walk over every example of the uploaded dataset and collect, in parallel
# lists, its id, its inputs and its outputs after cleaning with process_output.
examples = client.list_examples(dataset_name="RAG-HPO eval dataset")
ids, metadata, inputs, outputs = [], [], [], []
for example in examples:
    cleaned_outputs = process_output(example.outputs)
    ids += [example.id]
    inputs += [example.inputs]
    outputs += [cleaned_outputs]

In [None]:
# Write the collected inputs and the cleaned outputs back to the dataset,
# matching each entry to its example through the parallel ids list.
client.update_examples(
    dataset_name="RAG-HPO eval dataset",
    example_ids = ids,
    inputs = inputs, 
    outputs = outputs
    )

In [9]:
# Build two parallel lists: the text of every file in the text directory and
# the annotation codes read from the same-named file in the annotations
# directory.
# NOTE(review): the two paths spell the corpus differently ("GCS+_ESP" vs
# "GSC+") — confirm both match the directories on disk.
TEXT_DIR = "../../datasets/GCS+_ESP/Text"
ANNOTATIONS_DIR = "../../datasets/GSC+/Annotations"

texts = []
annotations = []
# os.listdir returns entries in arbitrary order; sorting makes the resulting
# dataset order reproducible between runs and machines.
for file in sorted(os.listdir(TEXT_DIR)):
    # Explicit encoding so the text is decoded the same way on every platform.
    # NOTE(review): assumes the text files are UTF-8 — confirm.
    with open(os.path.join(TEXT_DIR, file), "r", encoding="utf-8") as fp:
        texts.append(fp.read())
    annots = pd.read_csv(os.path.join(ANNOTATIONS_DIR, file), header=None, sep="\t")
    # Keep only the part of column 1 that precedes the first "|", trimmed.
    annots[1] = annots[1].apply(lambda x: x.split("|")[0].strip())
    annotations.append(annots[1].to_list())

In [11]:
# One row per file: its text under "esp" and its list of annotation values
# under "annotations".
gcs_esp = pd.DataFrame({"esp":texts, "annotations":annotations})

In [13]:
# Upload the GSC frame to LangSmith as a key-value dataset, with "esp" as the
# input column and "annotations" as the output column.
# NOTE(review): the RAG-HPO dataset renames its input column to
# 'clinical_note' while this one keeps 'esp' — confirm that whatever consumes
# this dataset expects the 'esp' key.
input_keys = ['esp']
output_keys = ['annotations'] 

dataset = client.upload_dataframe(
    df=gcs_esp,
    input_keys=input_keys,
    output_keys=output_keys,
    name="GSC eval dataset",
    description="Dataset en español para la evaluación de herramientas de codificación fenotípica.",
    data_type="kv" # The default
)

Definición de las métricas

In [39]:
def clean_final_answer(outputs):
    """Return the stripped ``hpo_code`` of each item in ``outputs["final answer"]``.

    Args:
        outputs: Mapping whose ``"final answer"`` entry is an iterable of
            objects exposing a string ``hpo_code`` attribute.

    Returns:
        A list with one whitespace-trimmed code per item, in the same order.
    """
    stripped_codes = []
    for item in outputs["final answer"]:
        stripped_codes.append(item.hpo_code.strip())
    return stripped_codes

In [3]:
# Directory containing the files phenopy needs.
phenopy_data_directory = "../../resources/"

# Inputs for building the annotated HPO network: the ontology file and the
# disease-to-phenotype annotation file.
obo_file = os.path.join(phenopy_data_directory, 'hp.obo')
disease_to_phenotype_file = os.path.join(phenopy_data_directory, 'phenotype.hpoa')

# The builder returns three values; only the network is passed to the scorer.
(hpo_network, alt2prim, disease_records) = generate_annotated_hpo_network(
    obo_file,
    disease_to_phenotype_file,
)

scorer = Scorer(hpo_network)

In [44]:
def traditional_metrics(outputs: dict, reference_outputs: dict) -> list[dict]:
    """Compute precision, recall and F1 of the predicted codes.

    Precision is the fraction of predicted codes found in the reference,
    recall the fraction of reference codes found in the prediction. Repeated
    codes count once per occurrence in the list they are taken from. F1 is
    rounded to two decimals; each metric is 0 when its denominator is zero.

    Args:
        outputs: Chain output, read through ``clean_final_answer``.
        reference_outputs: Mapping whose ``"annotations"`` entry is either a
            collection of codes or the text of a Python literal holding them.

    Returns:
        Three ``{"key": ..., "score": ...}`` dicts for precision, recall, f1.
    """
    import ast

    predicted_terms = clean_final_answer(outputs)

    # NOTE: the reference is parsed with ast.literal_eval rather than eval so
    # that dataset content is never executed; an already-parsed collection is
    # used as it is.
    raw_reference = reference_outputs["annotations"]
    real_terms = ast.literal_eval(raw_reference) if isinstance(raw_reference, str) else raw_reference

    # Sets are used only for O(1) membership tests; the counts still run over
    # the original sequences so duplicates are weighted as before.
    real_lookup = set(real_terms)
    predicted_lookup = set(predicted_terms)

    if predicted_terms:
        precision = sum(term in real_lookup for term in predicted_terms) / len(predicted_terms)
    else:
        precision = 0
    if real_terms:
        recall = sum(term in predicted_lookup for term in real_terms) / len(real_terms)
    else:
        recall = 0
    f1 = 0 if (precision + recall) == 0 else round(2 * (precision * recall) / (precision + recall), 2)

    return [
        {"key": "precision", "score": precision},
        {"key": "recall", "score": recall},
        {"key": "f1", "score": f1},
    ]

In [6]:
def semantic_similarity(outputs: dict, reference_outputs: dict) -> dict:
    """Score the predicted codes against the reference with the phenopy scorer.

    Args:
        outputs: Chain output, read through ``clean_final_answer``.
        reference_outputs: Mapping whose ``"annotations"`` entry is either a
            collection of codes or the text of a Python literal holding them.

    Returns:
        A dict with key ``"semantic similarity"`` and, as score, the value
        returned by ``scorer.score_term_sets_basic`` for the two term lists.
    """
    import ast

    predicted_terms = clean_final_answer(outputs)

    # NOTE: parsed with ast.literal_eval rather than eval so that dataset
    # content is never executed; an already-parsed collection is used as is.
    raw_reference = reference_outputs["annotations"]
    real_terms = ast.literal_eval(raw_reference) if isinstance(raw_reference, str) else raw_reference

    return {"key": "semantic similarity", "score": scorer.score_term_sets_basic(predicted_terms, real_terms)}

In [20]:
# Spot check: score two single-term sets against each other with the scorer.
scorer.score_term_sets_basic(["HP:0033349"], ["HP:0001317"])

0.0

In [7]:
def jaccard_similarity(outputs: dict, reference_outputs: dict) -> float:
    """Return the Jaccard similarity between predicted and reference codes.

    Both sides are reduced to sets; the result is the size of their
    intersection divided by the size of their union.

    Args:
        outputs: Chain output, read through ``clean_final_answer``.
        reference_outputs: Mapping whose ``"annotations"`` entry is either a
            collection of codes or the text of a Python literal holding them.

    Returns:
        A value between 0 and 1; 1.0 when both sets are empty.
    """
    import ast

    predicted_terms = set(clean_final_answer(outputs))

    # NOTE: parsed with ast.literal_eval rather than eval so that dataset
    # content is never executed; an already-parsed collection is used as is.
    raw_reference = reference_outputs["annotations"]
    real_terms = set(ast.literal_eval(raw_reference) if isinstance(raw_reference, str) else raw_reference)

    union = predicted_terms | real_terms
    if not union:
        return 1.0  # define similarity as 1.0 when both are empty
    return len(predicted_terms & real_terms) / len(union)

In [None]:
def retrieve_accuracy(outputs: dict, reference_outputs: dict) -> list[dict]:
    """Compute precision and recall of the retrieved candidates.

    The candidate set is the union of all collections in ``outputs["docs"]``.
    Recall is the share of reference codes present among the candidates;
    precision is the share of candidates that are reference codes. Each metric
    is 0 when its denominator would be zero, instead of raising
    ``ZeroDivisionError``.

    Args:
        outputs: Mapping whose ``"docs"`` entry is an iterable of collections
            of hashable candidates (presumably HPO codes per retrieval step —
            confirm against the chain).
        reference_outputs: Mapping whose ``"annotations"`` entry is either a
            collection of codes or the text of a Python literal holding them.

    Returns:
        Two ``{"key": ..., "score": ...}`` dicts: ``r_precision``, ``r_recall``.
    """
    import ast

    # NOTE: parsed with ast.literal_eval rather than eval so that dataset
    # content is never executed; an already-parsed collection is used as is.
    raw_reference = reference_outputs["annotations"]
    real_terms = set(ast.literal_eval(raw_reference) if isinstance(raw_reference, str) else raw_reference)

    predicted_candidates = set().union(*outputs["docs"])
    hits = len(real_terms & predicted_candidates)

    recall = hits / len(real_terms) if real_terms else 0
    precision = hits / len(predicted_candidates) if predicted_candidates else 0

    return [{"key": "r_precision", "score": precision},
            {"key": "r_recall", "score": recall}]

In [None]:
# Run custom_chain over the "RAG-HPO eval dataset" and score every run with
# the four evaluator functions defined in this notebook.
results = evaluate(
    custom_chain,
    data="RAG-HPO eval dataset",
    evaluators=[traditional_metrics, semantic_similarity, jaccard_similarity, retrieve_accuracy]
)