# Parsing Evaluation with Chroma

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from chunking_evaluation import GeneralEvaluation
from dotenv import load_dotenv
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

from lib.evaluation.segmentation.chroma.chroma_adapter import ChromaChunker
from lib.segmentation.methods.implementations.fixed_sized import FixedSizeChunker
from lib.segmentation.methods.implementations.recursive import RecursiveChunker
from lib.segmentation.methods.implementations.hierarchical import HierarchicalChunker
from lib.segmentation.methods.implementations.semantic import SemanticChunker
from lib.parsing.methods.parsers import Parsers
from lib.utils.export_table import export_table_to_latex

  "Respond only with the IDs of the chunks where you believe a split should occur. YOU MUST RESPOND WITH AT LEAST ONE SPLIT. THESE SPLITS MUST BE IN ASCENDING ORDER AND EQUAL OR LARGER THAN: " + str(current_chunk)+"." + (f"\n\The previous response of {invalid_response} was invalid. DO NOT REPEAT THIS ARRAY OF NUMBERS. Please try again." if invalid_response else "")
2026-02-14 22:38:35,948 - INFO : Use pytorch device_name: mps
2026-02-14 22:38:35,948 - INFO : Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [5]:
# Chunker configurations under evaluation: exactly one active configuration per
# method (Fixed-Size, Recursive, Semantic, Hierarchical), all with max_tokens=128.
# The commented-out entries are alternative parameter settings that are currently disabled.
# NOTE(review): the disabled Fixed-Size/Recursive entries use max_tokens=1028 while the
# disabled Semantic/Hierarchical entries use 1024 — confirm which value is intended.
# NOTE(review): the disabled Semantic entries for 512 and 1024 tokens are each listed twice
# with similarity_threshold=85, whereas the 128/256 pairs use 95 and 85 — presumably the
# first entry of each pair should be 95; verify before re-enabling.
chunkers = [
    FixedSizeChunker(max_tokens=128, overlap=0),
    # FixedSizeChunker(max_tokens=128, overlap=50),
    # FixedSizeChunker(max_tokens=256, overlap=0),
    # FixedSizeChunker(max_tokens=256, overlap=100),
    # FixedSizeChunker(max_tokens=512, overlap=0),
    # FixedSizeChunker(max_tokens=512, overlap=200),
    # FixedSizeChunker(max_tokens=1028, overlap=0),
    # FixedSizeChunker(max_tokens=1028, overlap=300),
    RecursiveChunker(max_tokens=128, overlap=0),
    # RecursiveChunker(max_tokens=128, overlap=50),
    # RecursiveChunker(max_tokens=256, overlap=0),
    # RecursiveChunker(max_tokens=256, overlap=100),
    # RecursiveChunker(max_tokens=512, overlap=0),
    # RecursiveChunker(max_tokens=512, overlap=200),
    # RecursiveChunker(max_tokens=1028, overlap=0),
    # RecursiveChunker(max_tokens=1028, overlap=300),
    SemanticChunker(max_tokens=128, similarity_threshold=95, min_tokens=64),
    # SemanticChunker(max_tokens=128, similarity_threshold=85, min_tokens=64),
    # SemanticChunker(max_tokens=256, similarity_threshold=95, min_tokens=128),
    # SemanticChunker(max_tokens=256, similarity_threshold=85, min_tokens=128),
    # SemanticChunker(max_tokens=512, similarity_threshold=85, min_tokens=256),
    # SemanticChunker(max_tokens=512, similarity_threshold=85, min_tokens=256),
    # SemanticChunker(max_tokens=1024, similarity_threshold=85, min_tokens=512),
    # SemanticChunker(max_tokens=1024, similarity_threshold=85, min_tokens=512),
    HierarchicalChunker(max_tokens=128, max_parent_token_ratio=0.4),
    # HierarchicalChunker(max_tokens=256, max_parent_token_ratio=0.4),
    # HierarchicalChunker(max_tokens=512, max_parent_token_ratio=0.4),
    # HierarchicalChunker(max_tokens=1024, max_parent_token_ratio=0.4),
]

In [6]:
# Wrap every chunker in the Chroma adapter exactly once. `chunkers` is rebound to the
# wrapped list, so on a re-run of this cell the entries are already ChromaChunker
# instances; those are kept as they are instead of being wrapped a second time.
chunkers = [
    c if isinstance(c, ChromaChunker) else ChromaChunker(c) for c in chunkers
]

In [8]:
# Make sure "OPENAI_API_KEY" is set in a .env file
# load_dotenv() loads the variables from that .env file into the process environment.
load_dotenv()
# No API key is passed explicitly here.
# NOTE(review): presumably the embedding function reads the key from the environment —
# confirm which variable name the installed chromadb version expects.
embedding_function = OpenAIEmbeddingFunction(model_name="text-embedding-3-small")

## General Evaluation (predefined data sets)

In [None]:
# Run the predefined-corpora evaluation once per chunker and collect two record lists:
#   results        - all-corpus mean scores plus the chunker's method info
#   pubmed_results - mean/std of every PubMed score list plus the method info
evaluation = GeneralEvaluation()

results, pubmed_results = [], []

for chunker in chunkers:
    c_info = chunker.get_info()
    c_res = evaluation.run(chunker, embedding_function)

    # Detach the per-corpus breakdown; only the PubMed part of it is used below.
    pubmed_scores = c_res.pop("corpora_scores")["pubmed"]

    # Add method info to all-corpus-mean scores
    c_res.update(c_info)
    results.append(c_res)

    # Get data for pubmed only: "<metric>_scores" -> "<metric>_mean" / "<metric>_std"
    c_row = {}
    for metric, scores in pubmed_scores.items():
        metric_name = metric.replace("_scores", "")
        c_row.update({
            f"{metric_name}_mean": np.mean(scores),
            f"{metric_name}_std": np.std(scores),
        })
    c_row.update(c_info)
    pubmed_results.append(c_row)

In [None]:
# All-corpus mean scores, one row per chunker configuration.
# NOTE: `results` is rebound by the synthetic-evaluation loop further down, so this cell
# shows the general-evaluation scores only if it is run before that loop.
pd.DataFrame(results)

In [None]:
# PubMed-only mean/std per metric, one row per chunker configuration.
pd.DataFrame(pubmed_results)

## Synthetic Evaluation

In [3]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from lib.evaluation.segmentation.chroma.chroma_setup import setup_evaluation_from_medical_qas

# Build the evaluation object that the synthetic-evaluation loop below calls `.run()` on.
# Documents are parsed with the Docling parser; "awmf" is presumably the name of the
# document collection — TODO confirm against chroma_setup.
# NOTE(review): the *_exist_ok flags presumably allow reusing already existing parse
# results and questions instead of regenerating them — verify in chroma_setup.
synth_eval = setup_evaluation_from_medical_qas(
    Parsers.DOCLING,
    "awmf",
    parse_exist_ok=True,
    question_exist_ok=True
)

2026-02-14 22:45:52,036 - INFO : Parsing 187-050 using DOCLING...
2026-02-14 22:45:52,037 - INFO : detected formats: [<InputFormat.PDF: 'pdf'>]
2026-02-14 22:45:52,095 - INFO : Going to convert document batch...
2026-02-14 22:45:52,096 - INFO : Initializing pipeline for StandardPdfPipeline with options hash 2b180c1bcae7ecc36ce1042af7f80137
2026-02-14 22:45:52,122 - INFO : Loading plugin 'docling_defaults'
2026-02-14 22:45:52,124 - INFO : Registered picture descriptions: ['picture_description_vlm_engine', 'vlm', 'api']
2026-02-14 22:45:52,130 - INFO : Loading plugin 'docling_defaults'
2026-02-14 22:45:52,137 - INFO : Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2026-02-14 22:45:52,589 - INFO : Auto OCR model selected ocrmac.
2026-02-14 22:45:52,594 - INFO : Loading plugin 'docling_defaults'
2026-02-14 22:45:52,598 - INFO : Registered layout engines: ['layout_object_detection', 'docling_layout_default', 'docling_experimental_table_crops_layo

0it [00:00, ?it/s]

2026-02-14 22:53:15,603 - ERROR : Could not find highlight in document corpus. Searched text: Ruhebeschwerden oder Beschwerden bei geringster
2026-02-14 22:53:15,607 - ERROR : Could not find highlight in document corpus. Searched text: 4 CCS 3 CCS 2 CCS 1 CCS
2026-02-14 22:53:15,814 - ERROR : Could not find highlight in document corpus. Searched text: Angina pectoris bei leichter körperlicher Belastung (normales Gehen, Ankleiden)
2026-02-14 22:53:16,343 - ERROR : Could not find highlight in document corpus. Searched text: Angina pectoris bei stärkerer Anstrengung (schnelles Laufen, Bergaufgehen, Treppensteigen nach dem Essen, bei Kälte, Wind oder psychischer Belastung)
2026-02-14 22:53:16,727 - ERROR : Could not find highlight in document corpus. Searched text: Keine Angina pectoris bei Alltagsbelastung (Laufen, Treppensteigen), jedoch bei plötzlicher oder längerer physischer Belastung
2026-02-14 22:53:25,147 - ERROR : Could not find highlight in document corpus. Searched text: Tabelle

In [9]:
# Synthetic evaluation: one record per chunker, holding the scores returned by
# synth_eval.run() (without the per-corpus breakdown) merged with the method info.
results = []

for chunker in chunkers:
    c_info = chunker.get_info()
    print(c_info)  # marks where each chunker's log output starts

    # NOTE(review): retrieve=-1 is passed through unchanged; its meaning is defined by the
    # evaluation class — confirm there.
    c_res = synth_eval.run(chunker, embedding_function, retrieve=-1)
    c_res.pop("corpora_scores")  # per-corpus scores are not reported here

    c_res.update(c_info)
    results.append(c_res)

{'Method': 'Fixed-Size', 'Param': '$O=0$', 'N': 128}


2026-02-14 22:57:35,446 - INFO : Chunking 065-003 using FIXED_SIZE...
Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors
2026-02-14 22:57:35,816 - INFO : Chunking 187-050 using FIXED_SIZE...
2026-02-14 22:57:36,200 - INFO : Chunking 045-015 using FIXED_SIZE...
2026-02-14 22:57:36,267 - INFO : Chunking 013-027 using FIXED_SIZE...
2026-02-14 22:57:37,009 - INFO : Chunking nvl-004 using FIXED_SIZE...
2026-02-14 22:57:37,380 - INFO : Chunking 187-019 using FIXED_SIZE...
2026-02-14 22:57:37,509 - INFO : Chunking 183-001 using FIXED_SIZE...
2026-02-14 22:57:37,940 - INFO : Chunking nvl-003 using FIXED_SIZE...
2026-02-14 22:57:38,160 - INFO : Chunking 187-039 using FIXED_SIZE...
2026-02-14 22:57:38,241 - INFO : Chunking 045-024 using FIXED_SIZE...
2026-02-14 22:57:38,346 - INFO : Chunking 045-028 using FIXED_SIZE...
2026-02-14 22:58:11,936 - INFO : HTTP Request: P

{'Method': 'Recursive', 'Param': '$O=0$', 'N': 128}


2026-02-14 22:59:09,395 - INFO : Chunking 065-003 using RECURSIVE...
2026-02-14 22:59:09,755 - INFO : Chunking 187-050 using RECURSIVE...
2026-02-14 22:59:10,157 - INFO : Chunking 045-015 using RECURSIVE...
2026-02-14 22:59:10,224 - INFO : Chunking 013-027 using RECURSIVE...
2026-02-14 22:59:10,767 - INFO : Chunking nvl-004 using RECURSIVE...
2026-02-14 22:59:11,156 - INFO : Chunking 187-019 using RECURSIVE...
2026-02-14 22:59:11,286 - INFO : Chunking 183-001 using RECURSIVE...
2026-02-14 22:59:11,734 - INFO : Chunking nvl-003 using RECURSIVE...
2026-02-14 22:59:11,961 - INFO : Chunking 187-039 using RECURSIVE...
2026-02-14 22:59:12,041 - INFO : Chunking 045-024 using RECURSIVE...
2026-02-14 22:59:12,151 - INFO : Chunking 045-028 using RECURSIVE...
2026-02-14 22:59:31,500 - INFO : HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-02-14 22:59:32,523 - INFO : HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-02-14 22:59:33,551 - I

{'Method': 'Semantic', 'Param': '$Q=95$', 'N': 128}


2026-02-14 23:00:42,209 - INFO : Chunking 065-003 using SEMANTIC...
2026-02-14 23:01:40,542 - INFO : Chunking 187-050 using SEMANTIC...
2026-02-14 23:02:51,160 - INFO : Chunking 045-015 using SEMANTIC...
2026-02-14 23:02:59,184 - INFO : Chunking 013-027 using SEMANTIC...
2026-02-14 23:03:52,211 - INFO : Chunking nvl-004 using SEMANTIC...
2026-02-14 23:05:03,246 - INFO : Chunking 187-019 using SEMANTIC...
2026-02-14 23:05:20,013 - INFO : Chunking 183-001 using SEMANTIC...
2026-02-14 23:06:54,200 - INFO : Chunking nvl-003 using SEMANTIC...
2026-02-14 23:07:35,029 - INFO : Chunking 187-039 using SEMANTIC...
2026-02-14 23:07:48,791 - INFO : Chunking 045-024 using SEMANTIC...
2026-02-14 23:08:07,478 - INFO : Chunking 045-028 using SEMANTIC...
2026-02-14 23:08:31,666 - INFO : HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-02-14 23:08:32,742 - INFO : HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-02-14 23:08:33,716 - INFO : HTTP 

{'Method': 'Hierarchical', 'Param': '$B_h=51$', 'N': 128}


2026-02-14 23:09:51,932 - INFO : Chunking 065-003 using HIERARCHICAL...
2026-02-14 23:09:52,950 - INFO : Chunking 187-050 using HIERARCHICAL...
2026-02-14 23:09:54,136 - INFO : Chunking 045-015 using HIERARCHICAL...
2026-02-14 23:09:54,277 - INFO : Chunking 013-027 using HIERARCHICAL...
2026-02-14 23:09:55,083 - INFO : Chunking nvl-004 using HIERARCHICAL...
2026-02-14 23:09:56,223 - INFO : Chunking 187-019 using HIERARCHICAL...
2026-02-14 23:09:56,751 - INFO : Chunking 183-001 using HIERARCHICAL...
2026-02-14 23:09:58,001 - INFO : Chunking nvl-003 using HIERARCHICAL...
2026-02-14 23:09:58,634 - INFO : Chunking 187-039 using HIERARCHICAL...
2026-02-14 23:09:58,821 - INFO : Chunking 045-024 using HIERARCHICAL...
2026-02-14 23:09:59,197 - INFO : Chunking 045-028 using HIERARCHICAL...
2026-02-14 23:15:00,686 - INFO : HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-02-14 23:15:02,630 - INFO : HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 20

In [10]:
# Tabulate the synthetic results, indexed by (Method, Param); the index level names are
# blanked afterwards.
res_df = pd.DataFrame(results).set_index(["Method", "Param"])
res_df.index.names = [None, None]
res_df

Unnamed: 0,Unnamed: 1,iou_mean,iou_std,recall_mean,recall_std,precision_omega_mean,precision_omega_std,precision_mean,precision_std,N
Fixed-Size,$O=0$,0.101169,0.108062,0.2866,0.288482,0.422392,0.11136,0.123153,0.116951,128
Recursive,$O=0$,0.120452,0.15426,0.262148,0.28318,0.591056,0.145471,0.157335,0.16973,128
Semantic,$Q=95$,0.120546,0.152881,0.266085,0.283141,0.591215,0.152295,0.15732,0.167997,128
Hierarchical,$B_h=51$,0.16311,0.196354,0.320684,0.282656,0.617073,0.165706,0.213247,0.216891,128


In [11]:
def combine_cols(df, metric_name: str):
    r"""Merge a metric's mean/std columns into one LaTeX-formatted string column.

    Reads ``<metric_name>_mean`` and ``<metric_name>_std`` from ``df`` and adds a new
    column named ``metric_name.capitalize()`` whose cells look like
    ``$0.12 \pm 0.03$``. Cells holding the highest rounded mean are wrapped in
    ``\mathbf{...}``, cells holding the second-highest distinct rounded mean are
    wrapped in ``\underline{...}``.

    Args:
        df: DataFrame containing the two source columns. It is modified in place.
        metric_name: Column prefix of the metric, e.g. ``"iou"``.

    Returns:
        None. The formatted column is added to ``df``.

    Raises:
        KeyError: If one of the two source columns is missing.
    """
    mean_col = f"{metric_name}_mean"
    std_col = f"{metric_name}_std"

    # Round once, with the same routine, for ranking, comparison and display. Ranking with
    # Series.round() but comparing against Python's round() is not guaranteed to agree on
    # half-way values, which would silently drop a highlight.
    means = [round(float(value), 2) for value in df[mean_col]]
    stds = [round(float(value), 2) for value in df[std_col]]

    # NaN never takes part in the ranking; otherwise it would sort last and be treated
    # as the maximum.
    ranked = sorted({value for value in means if value == value}, reverse=True)

    # With fewer than two distinct values there is simply no best / runner-up to mark.
    max_val = ranked[0] if ranked else None
    second_val = ranked[1] if len(ranked) > 1 else None

    def highlight_str(mean, std):
        """Format one cell and apply the best / second-best markup."""
        cell = fr"{mean:.2f} \pm {std:.2f}"

        if max_val is not None and mean == max_val:
            return fr"$\mathbf{{{cell}}}$"
        if second_val is not None and mean == second_val:
            return fr"\underline{{${cell}$}}"
        return f"${cell}$"

    df[metric_name.capitalize()] = [
        highlight_str(mean, std) for mean, std in zip(means, stds)
    ]

In [12]:
# Collapse each metric's mean/std pair into a single formatted LaTeX column.
combined_df = res_df.copy()
for metric_key in ("iou", "precision", "recall", "precision_omega"):
    combine_cols(combined_df, metric_key)

# Keep N plus the formatted columns, then give two of them their final headers.
header_names = {
    "Iou": "IoU",
    "Precision_omega": r"$\text{Precision}_\Omega$",
}
combined_df = combined_df[
    ["N", "Iou", "Recall", "Precision", "Precision_omega"]
].rename(columns=header_names)

# Cells already contain LaTeX markup, hence escaping and extra highlighting are disabled.
export_table_to_latex(
    combined_df,
    name="chroma_results",
    column_format="llccccc",
    escape_latex=False,
    highlight_mode=None,
)

2026-02-14 23:17:43,256 - INFO : Saved table content to: /Users/matteo/Uni/7.Semester/thesis/visual-chunking/thesis/figures/tables/chroma_results.tex
