# Parsing Evaluation with Chroma

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from chunking_evaluation import GeneralEvaluation
from dotenv import load_dotenv
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

from lib.evaluation.segmentation.chroma.chroma_adapter import ChromaChunker
from lib.segmentation.methods.implementations.fixed_sized import FixedSizeChunker
from lib.segmentation.methods.implementations.recursive import RecursiveChunker
from lib.segmentation.methods.implementations.hierarchical import HierarchicalChunker
from lib.segmentation.methods.implementations.semantic import SemanticChunker
from lib.parsing.methods.parsers import Parsers
from lib.utils.export_table import export_table_to_latex

2026-02-12 16:27:48,291 - INFO : Use pytorch device_name: mps
2026-02-12 16:27:48,291 - INFO : Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [2]:
# Chunker configurations to evaluate. Only one configuration per method
# (max_tokens=128) is active; the commented-out entries are alternative
# settings that can be re-enabled for a wider parameter sweep.
# NOTE(review): the commented-out Fixed-Size/Recursive entries use
# max_tokens=1028 while Semantic/Hierarchical use 1024 - presumably a typo; confirm.
# NOTE(review): the commented-out Semantic entries for 512 and 1024 tokens each
# appear twice with similarity_threshold=85; one of each pair was presumably
# meant to use 95 - confirm before re-enabling.
chunkers = [
    FixedSizeChunker(max_tokens=128, overlap=0),
    # FixedSizeChunker(max_tokens=128, overlap=50),
    # FixedSizeChunker(max_tokens=256, overlap=0),
    # FixedSizeChunker(max_tokens=256, overlap=100),
    # FixedSizeChunker(max_tokens=512, overlap=0),
    # FixedSizeChunker(max_tokens=512, overlap=200),
    # FixedSizeChunker(max_tokens=1028, overlap=0),
    # FixedSizeChunker(max_tokens=1028, overlap=300),
    RecursiveChunker(max_tokens=128, overlap=0),
    # RecursiveChunker(max_tokens=128, overlap=50),
    # RecursiveChunker(max_tokens=256, overlap=0),
    # RecursiveChunker(max_tokens=256, overlap=100),
    # RecursiveChunker(max_tokens=512, overlap=0),
    # RecursiveChunker(max_tokens=512, overlap=200),
    # RecursiveChunker(max_tokens=1028, overlap=0),
    # RecursiveChunker(max_tokens=1028, overlap=300),
    SemanticChunker(max_tokens=128, similarity_threshold=95, min_tokens=64),
    # SemanticChunker(max_tokens=128, similarity_threshold=85, min_tokens=64),
    # SemanticChunker(max_tokens=256, similarity_threshold=95, min_tokens=128),
    # SemanticChunker(max_tokens=256, similarity_threshold=85, min_tokens=128),
    # SemanticChunker(max_tokens=512, similarity_threshold=85, min_tokens=256),
    # SemanticChunker(max_tokens=512, similarity_threshold=85, min_tokens=256),
    # SemanticChunker(max_tokens=1024, similarity_threshold=85, min_tokens=512),
    # SemanticChunker(max_tokens=1024, similarity_threshold=85, min_tokens=512),
    HierarchicalChunker(max_tokens=128, max_parent_token_ratio=0.4),
    # HierarchicalChunker(max_tokens=256, max_parent_token_ratio=0.4),
    # HierarchicalChunker(max_tokens=512, max_parent_token_ratio=0.4),
    # HierarchicalChunker(max_tokens=1024, max_parent_token_ratio=0.4),
]

In [3]:
# Wrap every chunker in the Chroma adapter, which is the form the evaluation
# runs below receive. Entries that are already wrapped are kept as-is, so
# re-running this cell does not wrap a chunker a second time.
chunkers = [
    c if isinstance(c, ChromaChunker) else ChromaChunker(c)
    for c in chunkers
]

In [4]:
# "OPENAI_API_KEY" must be defined in a .env file; load_dotenv() is called
# before the embedding function is created.
EMBEDDING_MODEL = "text-embedding-3-small"

load_dotenv()
embedding_function = OpenAIEmbeddingFunction(model_name=EMBEDDING_MODEL)

## General Evaluation (predefined data sets)

In [33]:
evaluation = GeneralEvaluation()

# `results` collects the all-corpus summary per chunker,
# `pubmed_results` the mean/std per metric on the pubmed corpus only.
results, pubmed_results = [], []

for chunker in chunkers:
    method_info = chunker.get_info()
    scores = evaluation.run(chunker, embedding_function)

    # Split the per-corpus scores off the summary and keep only pubmed.
    pubmed_scores = scores.pop("corpora_scores")["pubmed"]

    # Summary row: remaining scores plus the chunker's method info.
    scores.update(method_info)
    results.append(scores)

    # Pubmed row: aggregate each "<metric>_scores" list into mean and std.
    pubmed_row = {}
    for metric, values in pubmed_scores.items():
        base_name = metric.replace("_scores", "")
        pubmed_row[f"{base_name}_mean"] = np.mean(values)
        pubmed_row[f"{base_name}_std"] = np.std(values)

    pubmed_row.update(method_info)
    pubmed_results.append(pubmed_row)

2026-02-12 19:33:11,243 - INFO : Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2026-02-12 19:33:14,138 - INFO : HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-02-12 19:33:15,332 - INFO : HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-02-12 19:33:18,406 - INFO : HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-02-12 19:33:19,532 - INFO : HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-02-12 19:33:20,761 - INFO : HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-02-12 19:33:21,988 - INFO : HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-02-12 19:33:23,762 - INFO : HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-02-12 19:33:25,162 - INFO : HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK

In [34]:
# One row per chunker: the evaluation's summary scores plus the chunker's method info.
pd.DataFrame(results)

Unnamed: 0,iou_mean,iou_std,recall_mean,recall_std,precision_omega_mean,precision_omega_std,precision_mean,precision_std,Method,Param,N
0,0.071491,0.055421,0.816822,0.336828,0.278425,0.142281,0.072443,0.056761,Fixed-Size,$O=0$,128
1,0.090608,0.073304,0.822958,0.345349,0.395015,0.207236,0.091829,0.075006,Recursive,$O=0$,128
2,0.090176,0.07434,0.799091,0.364234,0.395383,0.212419,0.0915,0.076112,Semantic,$Q=95$,128
3,0.09124,0.074133,0.812893,0.353815,0.400717,0.210302,0.092579,0.075877,Hierarchical,$B_h=51$,128


In [35]:
# One row per chunker: mean/std of every metric on the pubmed corpus plus the chunker's method info.
pd.DataFrame(pubmed_results)

Unnamed: 0,precision_omega_mean,precision_omega_std,iou_mean,iou_std,recall_mean,recall_std,precision_mean,precision_std,Method,Param,N
0,0.332326,0.14749,0.077619,0.061587,0.694596,0.393006,0.079566,0.062785,Fixed-Size,$O=0$,128
1,0.473584,0.220278,0.097231,0.090001,0.658639,0.426371,0.100027,0.092139,Recursive,$O=0$,128
2,0.478609,0.22084,0.098755,0.091438,0.657864,0.426532,0.101576,0.09353,Semantic,$Q=95$,128
3,0.478852,0.220908,0.098653,0.091321,0.654723,0.426583,0.1016,0.093345,Hierarchical,$B_h=51$,128


## Synthetic Evaluation

In [5]:
from lib.evaluation.segmentation.chroma.chroma_setup import setup_synthetic_evaluation

# Build the synthetic evaluation on top of the Docling parser.
# NOTE(review): "chroma_eval" is presumably the name of the corpus/config
# (the log output below references chroma_eval.txt) - confirm.
# NOTE(review): the *_exist_ok flags presumably allow reusing an already
# parsed corpus / already generated queries instead of regenerating them
# (the log reports "Skipping generation") - confirm in chroma_setup.
synth_eval = setup_synthetic_evaluation(
    Parsers.DOCLING,
    "chroma_eval",
    parse_exist_ok=True,
    query_exist_ok=True
)

2026-02-12 16:27:57,952 - INFO : Chroma document corpus exists at /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/configs/chroma/docling/chroma_eval.txt.Skipping generation.
2026-02-12 16:27:57,962 - INFO : Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2026-02-12 16:27:58,034 - INFO : Finished creating synthetic evaluation.


In [32]:
# Summary scores of the synthetic evaluation, one entry per chunker.
results = []

for chunker in chunkers:
    method_info = chunker.get_info()
    print(method_info)

    # NOTE(review): retrieve=-1 presumably lets the evaluation choose the
    # number of retrieved chunks itself - confirm in chunking_evaluation.
    scores = synth_eval.run(chunker, embedding_function, retrieve=-1)

    # Only the summary is kept; the per-corpus scores are discarded.
    scores.pop("corpora_scores")

    scores.update(method_info)
    results.append(scores)

{'Method': 'Fixed-Size', 'Param': '$O=0$', 'N': 128}


2026-02-12 19:10:04,801 - INFO : Chunking 018-029OLp_S3_Hodgkin-Lymphom_Erwachsene_Diagnostik_Therapie_Nachsorge_2022-04 using FIXED_SIZE...
2026-02-12 19:10:05,082 - INFO : Chunking 015-076p1_S2k_Frueher-Schwangerschaftsverlust-im-1-Trimenon_2025-06 using FIXED_SIZE...
2026-02-12 19:10:05,112 - INFO : Chunking 017-064p_S3_Chronischer_Tinnitus_2021-09 using FIXED_SIZE...
2026-02-12 19:10:05,154 - INFO : Chunking 018-032OLp_S3_Chronisch-lymphatische-Leukaemie_2021-07 using FIXED_SIZE...
2026-02-12 19:10:05,444 - INFO : Chunking 015-081p_S3_Adipositas-Schwangerschaft_2020_02 using FIXED_SIZE...
2026-02-12 19:10:05,478 - INFO : Chunking 015-076p2_S2k_Frueher-Schwangerschaftsverlust-im-1-Trimenon_2025-06 using FIXED_SIZE...
2026-02-12 19:10:08,019 - INFO : HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-02-12 19:10:11,706 - INFO : HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-02-12 19:10:14,882 - INFO : HTTP Request: POST http

{'Method': 'Recursive', 'Param': '$O=0$', 'N': 128}

KeyboardInterrupt: 

In [7]:
# Index the results by (Method, Param) and clear the index level names.
res_df = (
    pd.DataFrame(results)
    .set_index(["Method", "Param"])
    .rename_axis([None, None])
)
res_df

Unnamed: 0,Unnamed: 1,iou_mean,iou_std,recall_mean,recall_std,precision_omega_mean,precision_omega_std,precision_mean,precision_std,N
Fixed-Size,$O=0$,0.113335,0.084413,0.545414,0.351068,0.403693,0.15889,0.126384,0.098616,128
Recursive,$O=0$,0.133429,0.092111,0.515405,0.33766,0.443631,0.175662,0.158613,0.111838,128
Semantic,$Q=95$,0.137409,0.098823,0.494041,0.358025,0.44398,0.173321,0.16483,0.117221,128
Hierarchical,$B_h=51$,0.16245,0.13919,0.506803,0.343865,0.411051,0.173514,0.191374,0.160153,128


In [28]:
def combine_cols(df, metric_name: str):
    r"""Add a LaTeX-formatted "mean \pm std" column for one metric to ``df``.

    Reads ``f"{metric_name}_mean"`` and ``f"{metric_name}_std"``, rounds both to
    two decimals and writes the formatted cells into a new column named
    ``metric_name.capitalize()``. Rows whose rounded mean is the highest are
    set in bold, rows with the second-highest distinct rounded mean are
    underlined, all other rows are plain math cells.

    Args:
        df: DataFrame holding the ``*_mean`` / ``*_std`` columns. It is
            modified in place (a column is added).
        metric_name: Metric prefix, e.g. ``"iou"`` or ``"precision_omega"``.

    Returns:
        None.

    Raises:
        KeyError: If one of the two source columns is missing.
    """
    mean_col = f"{metric_name}_mean"
    std_col = f"{metric_name}_std"

    # Round once and reuse the same values for ranking and for the per-row
    # comparison, so both sides of the equality checks are guaranteed to match.
    means = df[mean_col].round(2)
    stds = df[std_col].round(2)

    # Distinct rounded means in ascending order. NaN is dropped so it can never
    # be picked as the best value; fewer than two distinct values (all rows
    # tied, a single row, or an empty frame) simply yields no runner-up
    # instead of raising IndexError.
    ranked = sorted(means.dropna().unique())
    max_val = ranked[-1] if len(ranked) >= 1 else None
    second_val = ranked[-2] if len(ranked) >= 2 else None

    def highlight_str(mean, std):
        cell = fr"{mean:.2f} \pm {std:.2f}"

        if max_val is not None and mean == max_val:
            return fr"$\mathbf{{{cell}}}$"
        if second_val is not None and mean == second_val:
            return fr"\underline{{${cell}$}}"
        return f"${cell}$"

    df[metric_name.capitalize()] = [
        highlight_str(mean, std) for mean, std in zip(means, stds)
    ]

In [31]:
# Work on a copy so the formatted columns do not end up in res_df.
combined_df = res_df.copy()

# Add one formatted "mean \pm std" column per metric.
for metric in ("iou", "precision", "recall", "precision_omega"):
    combine_cols(combined_df, metric)

# Keep only the presentation columns and give them their final labels.
column_labels = {
    "Iou": "IoU",
    "Precision_omega": r"$\text{Precision}_\Omega$",
}
combined_df = combined_df[
    ["N", "Iou", "Recall", "Precision", "Precision_omega"]
].rename(columns=column_labels)

# Cells already contain LaTeX markup, hence escaping is turned off.
export_table_to_latex(
    combined_df,
    name="chroma_results",
    column_format="llccccc",
    escape_latex=False,
    highlight_mode=None,
)

2026-02-12 17:17:49,006 - INFO : Saved table content to: /Users/matteo/Uni/7.Semester/thesis/visual-chunking/thesis/figures/tables/chroma_results.tex
