In [70]:
# Imports
import os
import json
import re
from typing import Any, Dict, List
from pathlib import Path
import pandas as pd

In [71]:
# Timestamp identifying the annotation run to analyse
date_time_str = "2025-04-28_16-08-34"

# Folder holding the LLM annotations of that run
LLM_ANNOTATIONS = f"../llm_outputs/annotations_{date_time_str}/"

# Entity labels the LLMs were asked to tag, and that we extract afterwards
TAGS = ["MOL", "SOFTNAME", "SOFTVERS", "STIME", "TEMP", "FFM"]

# Folder receiving the quality control results, and the CSV file written inside it
QC_RESULTS_FOLDER = f"../llm_outputs/stats_{date_time_str}/"
QC_RESULTS_PATH = f"{QC_RESULTS_FOLDER}quality_control_results.csv"

## **Quality control**
---

Before scoring the LLM responses, we implement a few functions to perform a basic quality check. This check helps ensure that the outputs meet a minimum standard before moving forward with evaluation or further processing.

For now, our quality control focuses on:
1. verifying that the LLM response includes the original input text
2. checking that at least one entity found by the LLM exists in the original text.


This simple check helps catch hallucinations or unrelated outputs from the model. Note that the comparison is **case-insensitive**.

This check is not, by itself, a filter on the responses; rather, it will help us filter them later on, and it also gives us an idea of how much liberty the LLM takes when answering our query.

To ensure the integrity of the LLM annotations, we need to verify that the output text closely matches the input text we provided.

1. **Strip annotation tags**

Since the LLM response includes XML-like tags (e.g., `<TAG LABEL>` and `</TAG LABEL>`), we first remove these tags to isolate the raw text. This allows for a fair comparison between the original input and the annotated output.

2. **Compare input and output texts**

Once tags are stripped, we compare the cleaned LLM output to the original input text. This helps us confirm that the model has not introduced hallucinated content or omitted any part of the input. Only the responses that **pass this check** are retained for further analysis or evaluation.



In [72]:
# Function to strip tags from the annotated text
def strip_tags(text:str, tags=TAGS) -> str:
    """
    Return `text` without the opening and closing annotation tags.

    Every `<TAG>` and `</TAG>` marker whose label is listed in `tags` is
    deleted (the text between the markers is kept), then leading and
    trailing whitespace is trimmed from the result.
    """
    cleaned = text
    for label in tags:
        # One pattern per label, matching both the opening and the closing form.
        tag_pattern = re.compile(f"</?{re.escape(label)}>")
        cleaned = tag_pattern.sub("", cleaned)
    return cleaned.strip()

# Function to compare the annotated text to the original input
def compare_annotated_to_original(original: str, annotated: str) -> bool:
    """
    Tell whether the annotated text is the original text once tags are removed.

    Both sides are trimmed and lowercased before the comparison, so the check
    ignores case as well as leading/trailing whitespace.
    Returns True on an exact match of the normalised texts, False otherwise.
    """
    expected = original.strip().lower()
    actual = strip_tags(annotated).strip().lower()
    return actual == expected

3. **Check that at least one entity found by the LLM is present in the original text**

For this step, we first need to load the LLM JSON file so that we can read and analyse the model response. From the LLM response, we extract the entities enclosed in the XML-like tags. We then go through these entities and check whether at least one of them is present in the original input text.

In [73]:
def process_llm_json_file(json_file: str) -> tuple:
    with open(json_file, "r") as f:
        data = json.load(f)

    # Extract the input text, response, and model
    text_to_annotate = data["text_to_annotate"]
    response = data["response"]
    model = data["model"]

    return text_to_annotate, response, model

def extract_entities_from_llm_text(text: str) -> dict:
    """
    Extract entities from an output text based on tagged annotations.
    
    The input text is expected to have entities enclosed in tags, e.g.:
    "Extending the Stochastic Titration CpHMD to <FFM>CHARMM36m</FFM> using <SOFTNAME>Gromacs</SOFTNAME>"
    
    The function returns a dictionary with keys corresponding to the desired entity types
    and values as lists with the extracted entity content.
    """
    # Initialize the results with empty lists for all desired keys.
    result = {key: [] for key in TAGS}
    
    # Use a regex to capture tags in the format <TAG>content</TAG>
    # The regex uses a backreference to ensure matching closing tag.
    pattern = re.compile(r"<([A-Z]+)>(.*?)</\1>")
    
    # Find all matches in the text.
    for tag, content in pattern.findall(text):
        # If the tag is one of our desired keys, append the content (stripped of whitespace)
        if tag in result:
            result[tag].append(content.strip())
    
    return result

def find_one_valid_llm_entity(llm_entities:dict, input_text:str) -> bool:
    """
    Check if at least one LLM entity is found in the input text.

    The search is a case-insensitive substring test, in line with the
    case-insensitive comparison used for the full-text check. Entities that
    are empty (or whitespace only) are skipped: an empty string is a substring
    of every text and would otherwise validate the response trivially.

    Args:
        llm_entities (dict): Dictionary mapping each tag to the list of
            entity strings the LLM produced for it.
        input_text (str): The original input text.

    Returns:
        bool: True if at least one non-empty entity is found, False otherwise.
    """
    haystack = input_text.lower()

    for values in llm_entities.values():
        for value in values:
            needle = value.strip().lower()
            if needle and needle in haystack:
                return True
    return False

We want to save the results of our quality control so that we can reuse them later when scoring, or visualise them through plots and graphs.

In [74]:
def save_qc_results_to_csv(rows: List[Dict[str, Any]], output_dir: str | Path) -> None:
    """Append rows to quality_control_results.csv inside output_dir.

    Each row dict must have the keys
    prompt, model, filename, text_unchanged, one_entity_verified, and full_path.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    csv_path = output_dir / "quality_control_results.csv"
    df = pd.DataFrame(rows)
    df.to_csv(csv_path, index=False, mode="a", header=not csv_path.exists())


This is the function responsible for the quality control of the LLM outputs.


In [75]:
def _list_model_folders(prompt_folder: Path) -> List[tuple[str, Path]]:
    """Return the (model name, folder of response files) pairs of one prompt folder."""
    found: List[tuple[str, Path]] = []

    for entry in sorted(prompt_folder.iterdir()):
        # Stray files and hidden entries (.DS_Store, .ipynb_checkpoints, ...)
        # are not model folders.
        if not entry.is_dir() or entry.name.startswith("."):
            continue

        if entry.name.startswith("meta-llama"):
            # "meta-llama" model names contain a slash
            # (e.g. "meta-llama/llama-4-maverick-17b-128e-instruct"), so the
            # responses sit one level deeper. Every nested folder is a model
            # of its own; an empty "meta-llama" folder contributes nothing.
            for nested in sorted(entry.iterdir()):
                if nested.is_dir() and not nested.name.startswith("."):
                    found.append((f"{entry.name}/{nested.name}", nested))
        else:
            # Any other model (e.g. "gemma2-9b-it") is named after its folder.
            found.append((entry.name, entry))

    return found


def quality_control(path_to_test: str | Path) -> None:
    """Check the quality of the responses and write one summary row per file.

    The folder is expected to be laid out as
    ``<path_to_test>/<prompt>/<model>/<response file>``, with one extra level
    for "meta-llama" models. For every response file, two checks are run:
    whether the tag-stripped response equals the input text, and whether at
    least one extracted entity occurs in the input text. The outcomes are
    collected and written to the quality control CSV in a single call.
    Entries are visited in sorted order so that the CSV is reproducible, and
    hidden entries as well as entries of the wrong kind (a file where a
    folder is expected, or the reverse) are skipped.

    Parameters
    ----------
    path_to_test: str | Path
        Path to the folder containing the llm responses.

    """

    # Start from a clean slate: results are appended to the CSV, so a file
    # left over from a previous run has to be removed first.
    results_file = Path(QC_RESULTS_PATH)
    if results_file.exists():
        results_file.unlink()
        print(f"Overwriting existing file: {results_file}\n\n")

    path_to_test = Path(path_to_test)

    # Collect rows in memory
    rows: List[Dict[str, Any]] = []

    for prompt_folder in sorted(path_to_test.iterdir()):  # prompt loop
        if not prompt_folder.is_dir() or prompt_folder.name.startswith("."):
            continue
        prompt_name = prompt_folder.name

        for model, model_folder in _list_model_folders(prompt_folder):  # model loop
            for file_path in sorted(model_folder.iterdir()):  # file loop
                if not file_path.is_file() or file_path.name.startswith("."):
                    continue

                input_text, response, _ = process_llm_json_file(file_path)
                llm_entities = extract_entities_from_llm_text(response)

                exact_text_result = compare_annotated_to_original(input_text, response)
                entities_result = find_one_valid_llm_entity(llm_entities, input_text)

                rows.append(
                    {
                        "prompt": prompt_name,
                        "model": model,
                        "filename": file_path.name,
                        "text_unchanged": exact_text_result,
                        "one_entity_verified": entities_result,
                        "full_path": str(file_path),
                    }
                )

    # Persist everything in one write
    save_qc_results_to_csv(rows, QC_RESULTS_FOLDER)

### **Actual quality control and saving results:**

In [76]:
quality_control(LLM_ANNOTATIONS)