In [49]:
# Imports
import os
import json
import re
from typing import Any, Dict, List, Tuple
from pathlib import Path
import pandas as pd

In [50]:
# Folder with the ground-truth texts
ANNOTATIONS_FOLDER = "../annotations/"

# List of entities to tag (by the LLMs) and then extract
TAGS = ["MOL", "SOFTNAME", "SOFTVERS", "STIME", "TEMP", "FFM"]

# Folder of the LLM run we want to score.
# Change ONLY this path to score another run: the QC input and the scoring
# output below are derived from it, so they can no longer point to different runs.
STATS_FOLDER = "../llm_outputs/stats_2025-04-28_15-06-21/"

# Quality-control summary of the run being scored (input of this notebook)
QC_RESULTS_PATH = STATS_FOLDER + "quality_control_results.csv"

# Folder and CSV file where the scoring results will be written
SCORE_RESULTS_FOLDER = STATS_FOLDER
SCORE_RESULTS_PATH = SCORE_RESULTS_FOLDER + "scoring_results.csv"

## **LLM annotations scoring**
---

To assess the quality of entity annotations produced by different LLMs, we implement a set of evaluation metrics that allow both quantitative and qualitative analysis. We aim to measure how well each model performs in identifying and labeling entities.

However, before we can properly assess the quality of the annotations, we need to extract the entities and store them in a standard structure.
Both the ground-truth entities and the llm-annotated entities will be in the following structure:

```json
{
  "MOL": ["arylamide", "hDM2", "p53", "Nutlin-2", "benzodiazepinedione"],
  "SOFTNAME": ["AutoDock"],
  "SOFTVERS": [],
  "STIME": ["20 ns"],
  "TEMP": [],
  "FFM": ["GAFF"]
}
```

For this, we need different helper functions that will:

- convert the current **ground-truth annotation** format to the one we want


Current ground-truth annotation format:
```json
{
  "classes": ["TEMP", "SOFT", "STIME", "MOL", "FFM"],
  "annotations": [[
      "An in silico approach to determine inter-subunit affinities in human septin complexes.",
      {"entities": [[69, 75, "MOL"], [90, 97, "MOL"], [1255, 1260, "MOL"], [1368, 1374, "MOL"]]}
  ]]
}
```

- convert the **LLM-output annotation** format to the one we want

LLM-output annotation format:
```json
{
  "model": "gemma2-9b-it",
  "text_to_annotate": "Extending the Stochastic Titration CpHMD to CHARMM36m.",
  "response": "Extending the Stochastic Titration CpHMD to <FFM>CHARMM36m</FFM>."
}
```

In [51]:
# Extract entities from ground truth
def extract_entities_from_annotation(text: str, entities: list) -> dict:
    """
    Extract entities from the given text based on a direct list of annotation triples.

    The entities input should be a list of lists formatted as:
    [
        [start_index, end_index, "ENTITY_TYPE"],
        ...
    ]

    The function extracts the substring ``text[start_index:end_index]`` for each
    triple and groups the results by entity type according to TAGS.
    The ground-truth label "SOFT" is stored under "SOFTNAME".
    If an entity type is not in TAGS, it is ignored.
    If no entities are found for a type, its output list remains empty.

    Parameters:
        text (str): The annotated text the character indices refer to.
        entities (list): Annotation triples [start_index, end_index, entity_type].

    Returns:
        dict: One key per entry of TAGS, each mapped to the list of extracted
        entity strings (in annotation order, duplicates kept).
    """
    # Initialize the output dictionary with empty lists for each desired key.
    result = {key: [] for key in TAGS}

    # Iterate over each entity annotation.
    for start, end, entity_type in entities:
        # The ground-truth files label software names "SOFT"; we score them as "SOFTNAME".
        if entity_type == 'SOFT':
            entity_type = 'SOFTNAME'

        # Ignore labels we do not score (indexing `result` directly would raise KeyError).
        if entity_type not in result:
            continue

        result[entity_type].append(text[start:end])

    return result

# Extract entities from the LLM output text
def extract_entities_from_llm_text(text: str) -> dict:
    """
    Collect the entities an LLM marked up with inline tags.

    Entities are expected to be wrapped as ``<TAG>content</TAG>``, e.g.:
    "Extending the Stochastic Titration CpHMD to <FFM>CHARMM36m</FFM> using <SOFTNAME>Gromacs</SOFTNAME>"

    Only tags listed in TAGS are kept; the content is stripped of surrounding
    whitespace. The returned dictionary has one key per entry of TAGS, each
    mapped to the list of extracted strings (empty when nothing was tagged).
    """
    entities_by_tag = {tag_name: [] for tag_name in TAGS}

    # <TAG>content</TAG>; the backreference \1 forces the closing tag to match the opening one.
    tagged_span = re.compile(r"<([A-Z]+)>(.*?)</\1>")

    for found in tagged_span.finditer(text):
        tag_name, inner = found.group(1), found.group(2)
        # Tags we are not scoring are dropped.
        if tag_name not in entities_by_tag:
            continue
        entities_by_tag[tag_name].append(inner.strip())

    return entities_by_tag

Now onto the scoring. **Evaluation logic:**



1. **Exact match scoring**: Entity is correct if string and type match exactly.


2. **Confidence score**: Fraction of LLMs that agreed on the same entity. (!!! tricky because not all the LLMs will preserve the original text) - ***Not added in yet***


3. **Detection ratio**: Correct entities found vs. total ground truth.


4. **False positives**: Entities predicted but not in ground truth.


5. **False negatives**: Ground truth entities missed by LLM.


6. **Per-type breakdown**: Scores computed by entity type.

In [52]:
# Calculate the exact match score
def exact_match_score(ground_truth: Dict[str, List[str]], predicted: Dict[str, List[str]]) -> Tuple[int, int, float]:
    """
    Overall exact-match score, summed over every entity type of the ground truth.

    A ground-truth entity counts as matched when the very same string was
    predicted under the very same type. Each ground-truth occurrence is
    counted, so a repeated entity contributes once per repetition.

    Parameters:
        ground_truth (dict): Ground truth annotations.
        predicted (dict): Predicted annotations.

    Returns:
        tuple: (number of exact matches, total ground truth entities, score ratio).
        The ratio is 0 when the ground truth holds no entity at all.
    """
    hits = 0
    gt_count = 0

    for label, expected in ground_truth.items():
        found = set(predicted.get(label, []))
        gt_count += len(expected)
        hits += sum(1 for item in expected if item in found)

    if gt_count > 0:
        ratio = hits / gt_count
    else:
        ratio = 0
    return hits, gt_count, ratio


def detection_ratio(ground_truth: Dict[str, List[str]], predicted: Dict[str, List[str]]) -> Dict[str, float]:
    """
    Fraction of ground-truth entities recovered by the prediction, per entity type.

    Parameters:
        ground_truth (dict): Ground truth annotations.
        predicted (dict): Predicted annotations.

    Returns:
        dict: Mapping from each ground-truth entity type to its detection ratio
        (0 to 1). A type without any ground-truth entity maps to None, because
        the ratio is undefined there.
    """
    per_type = {}
    for label in ground_truth:
        expected = ground_truth[label]
        found = set(predicted.get(label, []))

        if not expected:
            # Nothing to detect for this type: leave the ratio undefined.
            per_type[label] = None
            continue

        hits = len([item for item in expected if item in found])
        per_type[label] = hits / len(expected)
    return per_type


def false_positives(ground_truth: Dict[str, List[str]], predicted: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """
    List, per entity type, the predicted entities that are absent from the ground truth.

    Parameters:
        ground_truth (dict): Ground truth annotations.
        predicted (dict): Predicted annotations.

    Returns:
        dict: Mapping from each predicted entity type to its false positives,
        in prediction order (a repeated prediction is listed each time).
    """
    spurious = {}
    for label in predicted:
        known = set(ground_truth.get(label, []))
        # Keep only the predictions the ground truth does not contain for this type.
        spurious[label] = list(filter(lambda item: item not in known, predicted[label]))
    return spurious


def false_negatives(ground_truth: Dict[str, List[str]], predicted: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """
    List, per entity type, the ground-truth entities the prediction failed to report.

    Parameters:
        ground_truth (dict): Ground truth annotations.
        predicted (dict): Predicted annotations.

    Returns:
        dict: Mapping from each ground-truth entity type to its false negatives,
        in ground-truth order (a repeated entity is listed each time).
    """
    missed = {}
    for label, expected in ground_truth.items():
        found = set(predicted.get(label, []))
        overlooked = []
        for item in expected:
            # A ground-truth entity that was never predicted for this type is a miss.
            if item not in found:
                overlooked.append(item)
        missed[label] = overlooked
    return missed


def per_type_breakdown(ground_truth: Dict[str, List[str]], predicted: Dict[str, List[str]]) -> Dict[str, Dict[str, Any]]:
    """
    Provides a detailed breakdown per entity type.

    For each entity type, returns a dict with:
      - 'exact_matches': number of ground truth entities found in the prediction,
      - 'total_gt': total number of ground truth entities,
      - 'detection_ratio': fraction of ground truth detected (None if 'total_gt' is 0),
      - 'false_positives': number of predicted entities absent from the ground truth,
      - 'false_negatives': number of ground truth entities absent from the prediction.

    Parameters:
        ground_truth (dict): Ground truth annotations.
        predicted (dict): Predicted annotations.

    Returns:
        dict: Breakdown per entity type. Types appear in a stable order: the
        ground truth types first (in their own order), then any type that only
        exists in the prediction.
    """
    breakdown = {}

    # Ordered union of the keys: iterating a plain set would give a different
    # (hash-dependent) order from one run to the next.
    entity_types = dict.fromkeys([*ground_truth, *predicted])

    for entity_type in entity_types:
        gt_entities = ground_truth.get(entity_type, [])
        pred_entities = predicted.get(entity_type, [])
        gt_set = set(gt_entities)
        pred_set = set(pred_entities)

        exact_match_count = sum(1 for e in gt_entities if e in pred_set)
        total_gt = len(gt_entities)
        # The ratio is undefined when there is nothing to detect for this type.
        detection = exact_match_count / total_gt if total_gt > 0 else None

        breakdown[entity_type] = {
            'exact_matches': exact_match_count,
            'total_gt': total_gt,
            'detection_ratio': detection,
            'false_positives': sum(1 for e in pred_entities if e not in gt_set),
            'false_negatives': sum(1 for e in gt_entities if e not in pred_set),
        }

    return breakdown


Now, we go looking for the texts to score.

We can extract them from a quality control (QC) summary (csv file). We just need to specify the file we are interested in, in order to then extract the texts that are worth scoring (those that have at least one entity that we know exists in the original text).

In [53]:
def extract_annotations_to_score(
    csv_file: str | Path,
) -> Tuple[pd.DataFrame, List[str], List[str]]:
    """
    Return the subset of rows with ``one_entity_verified == True`` and
    give back the two key columns as Python lists.

    Parameters
    ----------
    csv_file : str | Path
        Path to *filtering_results.csv*.

    Returns
    -------
    tuple
        (filtered_df,
         filenames,            # list[str]
         full_paths)           # list[str]
    """
    csv_file = Path(csv_file)

    df = pd.read_csv(csv_file)

    filtered = df[df["one_entity_verified"]]

    filenames  = filtered["filename"].tolist()
    full_paths = filtered["full_path"].tolist()

    return filtered[["prompt", "model", "filename", "full_path"]], filenames, full_paths

In [54]:
# Load the filtered results from the QC results summary
# and extract the filenames and full paths to the annotations
# Read the QC summary and keep the rows worth scoring:
#   df            -> prompt / model / filename / full_path of each kept row
#   filenames     -> names of the original (ground-truth) files
#   llm_filenames -> full paths to the LLM-annotated files
df, filenames, llm_filenames = extract_annotations_to_score(QC_RESULTS_PATH)

# View the first 5 rows
print(df.head())

# Turn each original file name into the full path of its ground-truth annotation
filenames = [os.path.join(ANNOTATIONS_FOLDER, name) for name in filenames]

      prompt                                          model  \
0  zero_shot  meta-llama/llama-4-maverick-17b-128e-instruct   
1  zero_shot  meta-llama/llama-4-maverick-17b-128e-instruct   
2  zero_shot  meta-llama/llama-4-maverick-17b-128e-instruct   
3  zero_shot  meta-llama/llama-4-maverick-17b-128e-instruct   
4  zero_shot  meta-llama/llama-4-maverick-17b-128e-instruct   

                 filename                                          full_path  
0  figshare_22213635.json  ../llm_outputs/annotations_2025-04-28_15-06-21...  
1   figshare_4757161.json  ../llm_outputs/annotations_2025-04-28_15-06-21...  
2  figshare_21263177.json  ../llm_outputs/annotations_2025-04-28_15-06-21...  
3     zenodo_6582985.json  ../llm_outputs/annotations_2025-04-28_15-06-21...  
4     zenodo_6478270.json  ../llm_outputs/annotations_2025-04-28_15-06-21...  


In [55]:
# Process one JSON file to extract the ground truth entities and the input text
def process_json_file(json_file: str) -> tuple:
    """
    Read one ground-truth annotation file.

    The file is expected to hold an "annotations" list whose first entry is
    ``[text, {"entities": [[start, end, label], ...]}]``; only that first
    entry is read.

    Parameters:
        json_file (str): Path to the ground-truth JSON file.

    Returns:
        tuple: (input_text, ground_truth_entities), i.e. the annotated text and
        its list of [start, end, label] triples.
    """
    # Decode explicitly as UTF-8: with the platform default encoding, non-ASCII
    # characters could be mis-decoded, shifting the character offsets of the entities.
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Extract the input text and its entity triples from the first annotation entry
    annotation_entry = data["annotations"][0]
    input_text = annotation_entry[0]
    ground_truth_entities = annotation_entry[1]["entities"]

    return input_text, ground_truth_entities

# Process one LLM JSON file to extract the input text, response, and model
def process_llm_json_file(json_file: str) -> tuple:
    """
    Read one LLM output file.

    Parameters:
        json_file (str): Path to a JSON file holding the keys
            "text_to_annotate", "response" and "model".

    Returns:
        tuple: (text_to_annotate, response, model), the values of those three keys.
    """
    # Decode explicitly as UTF-8 so the tagged response is read the same way on
    # every platform, whatever the default locale encoding is.
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Extract the input text, response, and model
    text_to_annotate = data["text_to_annotate"]
    response = data["response"]
    model = data["model"]

    return text_to_annotate, response, model

# Saves the scoring results to a CSV file one row at a time
def save_scoring_results_to_csv(rows: List[Dict[str, Any]], output_dir: str | Path) -> None:
    """Append rows to filtering_results.csv inside output_dir.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    csv_path = output_dir / "scoring_results.csv" # Name of the CSV file
    df = pd.DataFrame(rows)
    df.to_csv(csv_path, index=False, mode="a", header=not csv_path.exists())

### **Actual scoring and saving results:**

In [56]:
# Start from a clean results file: one row is appended per scored text below,
# so a file left over from a previous run would otherwise keep growing.
summary_file_path = Path(SCORE_RESULTS_PATH)
if summary_file_path.exists():
    os.remove(summary_file_path)
    print(f"Overwriting existing file: {summary_file_path}\n\n")

# Score every (LLM output, ground truth) pair.
# Position i is shared by `llm_filenames`, `filenames` and the rows of `df`.
for i, (llm_filename, gt_filename) in enumerate(zip(llm_filenames, filenames)):

    # Process the ground-truth JSON file and extract entities
    input_text, ground_truth_entities = process_json_file(gt_filename)
    gt_extracted = extract_entities_from_annotation(input_text, ground_truth_entities)

    # Process the LLM JSON file and extract entities
    _, response, _ = process_llm_json_file(llm_filename)
    llm_extracted = extract_entities_from_llm_text(response)

    # Compute the metrics ====================================================
    matched, total, score_ratio = exact_match_score(gt_extracted, llm_extracted)
    fps = false_positives(gt_extracted, llm_extracted)
    fns = false_negatives(gt_extracted, llm_extracted)
    detect_ratio = detection_ratio(gt_extracted, llm_extracted)
    breakdown = per_type_breakdown(gt_extracted, llm_extracted)

    # Print results ==========================================================
    print(llm_filename, "\n")
    print(f"★ Exact match score: {matched}/{total} ({score_ratio:.2f})")

    print("\n★ False positives (hallucination ?):")
    for etype, fp_list in fps.items():
        print(f"  {etype}: {fp_list}")

    print("\n★ False negatives (missed ?):")
    for etype, fn_list in fns.items():
        print(f"  {etype}: {fn_list}")

    print("\n★ Detection ratio per type (# of correct entities found by LLM ÷ # of entities in the ground truth):")
    for etype, ratio in detect_ratio.items():
        print(f"  {etype}: {ratio}")

    print("\n★ Per-type breakdown:")
    for etype, stats in breakdown.items():
        print(f"  {etype}: {stats}")

    print("\n","="*200, "\n")

    # Save the results to a CSV file -----------------------------------------

    # Columns of `df`: 0 = prompt, 1 = model, 2 = filename, 3 = full path
    prompt_name = df.iloc[i, 0]
    model = df.iloc[i, 1]
    filename = df.iloc[i, 2]
    file_path = df.iloc[i, 3]

    percentage_correct = round(score_ratio * 100, 2)

    row: Dict[str, Any] = {
        "prompt": prompt_name,
        "model": model,
        "filename": filename,
        "percentage_correct": percentage_correct,
        "total_correct": matched,
        "total": total,
    }

    # Four columns per scored tag, generated from TAGS so that the CSV layout
    # always follows the list of tags instead of a hand-maintained copy of it.
    for tag in TAGS:
        row[f"{tag}_correct"] = breakdown[tag]["exact_matches"]
        row[f"{tag}_total"] = breakdown[tag]["total_gt"]
        # Entities are flattened to a single "; "-separated cell
        row[f"{tag}_FP"] = "; ".join(map(str, fps[tag]))
        row[f"{tag}_FN"] = "; ".join(map(str, fns[tag]))

    row["full path"] = str(file_path)

    # Append this text's row right away, so partial results survive an interrupted run
    rows: List[Dict[str, Any]] = [row]
    output_path = SCORE_RESULTS_FOLDER
    save_scoring_results_to_csv(rows, output_path)

Overwriting existing file: ../llm_outputs/stats_2025-04-28_15-06-21/scoring_results.csv


../llm_outputs/annotations_2025-04-28_15-06-21/zero_shot/meta-llama/llama-4-maverick-17b-128e-instruct/figshare_22213635.json 

★ Exact match score: 9/10 (0.90)

★ False positives (hallucination ?):
  MOL: ['ammonia', 'NH3+ H2']
  SOFTNAME: ['MP2', 'cc-pVDZ', 'CCSD (T)', 'cc-pVTZ']
  SOFTVERS: []
  STIME: []
  TEMP: ['T130 K']
  FFM: []

★ False negatives (missed ?):
  MOL: []
  SOFTNAME: []
  SOFTVERS: []
  STIME: []
  TEMP: ['130 K']
  FFM: []

★ Detection ratio per type (# of correct entities found by LLM ÷ # of entities in the ground truth):
  MOL: 1.0
  SOFTNAME: None
  SOFTVERS: None
  STIME: None
  TEMP: 0.0
  FFM: None

★ Per-type breakdown:
  TEMP: {'exact_matches': 0, 'total_gt': 1, 'detection_ratio': 0.0, 'false_positives': 1, 'false_negatives': 1}
  MOL: {'exact_matches': 9, 'total_gt': 9, 'detection_ratio': 1.0, 'false_positives': 2, 'false_negatives': 0}
  FFM: {'exact_matches': 0, '