# Parsing Evaluation with Chroma

In [1]:
%load_ext autoreload
%autoreload 2
from chunking_evaluation import GeneralEvaluation
from dotenv import load_dotenv
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

import pandas as pd
import numpy as np

from evaluation.segmentation.chroma.chroma_adapter import ChromaChunker
from evaluation.segmentation.chroma.chroma_setup import setup_synthetic_evaluation
from parsing.methods.config import Parsers
from segmentation.methods.fixed_sized import FixedSizeChunker
from segmentation.methods.recursive import RecursiveChunker

  "Respond only with the IDs of the chunks where you believe a split should occur. YOU MUST RESPOND WITH AT LEAST ONE SPLIT. THESE SPLITS MUST BE IN ASCENDING ORDER AND EQUAL OR LARGER THAN: " + str(current_chunk)+"." + (f"\n\The previous response of {invalid_response} was invalid. DO NOT REPEAT THIS ARRAY OF NUMBERS. Please try again." if invalid_response else "")
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Chunker configurations under evaluation. Only the uncommented entries are
# instantiated; the commented-out lines are alternative max_tokens/overlap
# settings that can be re-enabled to widen the comparison.
chunkers = [
    # FixedSizeChunker(max_tokens=800, overlap=400),
    # RecursiveChunker(max_tokens=800, overlap=400),
    FixedSizeChunker(max_tokens=400, overlap=200),
    # RecursiveChunker(max_tokens=400, overlap=200),
    # FixedSizeChunker(max_tokens=400, overlap=0),
    # RecursiveChunker(max_tokens=400, overlap=0),
    # FixedSizeChunker(max_tokens=200, overlap=0),
    # RecursiveChunker(max_tokens=200, overlap=0),
]

In [3]:
# Wrap every configured chunker in the ChromaChunker adapter so it can be
# passed to the evaluation runs below.
# Entries that are already ChromaChunker instances are kept as-is: this cell
# rebinds `chunkers`, so without the guard a second execution of the cell
# would wrap the adapters a second time.
chunkers = [
    c if isinstance(c, ChromaChunker) else ChromaChunker(c)
    for c in chunkers
]

In [4]:
# Load variables from a local .env file into the environment before the
# embedding client is built; "OPENAI_API_KEY" must be defined in that file.
load_dotenv()

# Every evaluation in this notebook embeds text with this one OpenAI model.
embedding_model_name = "text-embedding-3-large"
embedding_function = OpenAIEmbeddingFunction(model_name=embedding_model_name)

## General Evaluation (predefined data sets)

In [5]:
# Run the general evaluation once per chunker and collect two tables:
#   results         - the run summary with the raw per-corpus scores removed
#   pubmed_results  - mean/std of every metric for the "pubmed" corpus only
evaluation = GeneralEvaluation()

results = []
pubmed_results = []

for chunker in chunkers:
    run_summary = evaluation.run(chunker, embedding_function)

    # Keep a handle on the PubMed scores, then drop the whole per-corpus
    # entry so only the remaining summary fields end up in `results`.
    pubmed_metric_scores = run_summary["corpora_scores"]["pubmed"]
    run_summary.pop("corpora_scores")
    results.append(run_summary)

    pubmed_row = {}
    for score_key, values in pubmed_metric_scores.items():
        # "<metric>_scores" -> "<metric>", used as the column prefix.
        prefix = score_key.replace("_scores", "")
        pubmed_row.update(
            {
                f"{prefix}_mean": np.mean(values),
                f"{prefix}_std": np.std(values),
            }
        )

    pubmed_results.append(pubmed_row)

2025-12-13 20:41:14,417 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
Token indices sequence length is longer than the specified maximum sequence length for this model (25022 > 512). Running this sequence through the model will result in indexing errors
2025-12-13 20:41:21,350 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-13 20:41:23,631 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-13 20:41:26,055 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-13 20:41:28,346 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-13 20:41:30,870 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-13 20:41:33,303 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-13 20:41:35,785 - IN

In [6]:
# Summary of the general evaluation: one row per evaluated chunker.
pd.DataFrame(results)

Unnamed: 0,iou_mean,iou_std,recall_mean,recall_std,precision_omega_mean,precision_omega_std,precision_mean,precision_std
0,0.026264,0.021841,0.866622,0.323998,0.084227,0.052051,0.026295,0.021859


In [7]:
# PubMed-only statistics (mean/std per metric): one row per evaluated chunker.
pd.DataFrame(pubmed_results)

Unnamed: 0,precision_omega_mean,precision_omega_std,iou_mean,iou_std,recall_mean,recall_std,precision_mean,precision_std
0,0.107834,0.058487,0.031702,0.025576,0.752538,0.401393,0.031813,0.025604


## Synthetic Evaluation

In [5]:
# Build the synthetic evaluation for the "batch_1" documents parsed with Docling.
# NOTE(review): query_exist_ok=True presumably tolerates queries that already
# exist on disk -- confirm against chroma_setup.setup_synthetic_evaluation.
synth_eval = setup_synthetic_evaluation(Parsers.DOCLING, "batch_1", query_exist_ok=True)

4it [00:00, 90.17it/s]
2025-12-13 20:20:42,416 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Skipping Document: ESMO_Breast_Cancer. Output JSON already exists.
Skipping Document: NCCN_breast_p1. Output JSON already exists.
Skipping Document: NCCN_breast. Output JSON already exists.
Skipping Document: Contouring_ESTRO_breast_and_LAW. Output JSON already exists.
Info: Scanning /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/configs/chroma/docling/batch_1 for text documents...
Info: Using 4 documents to create synthetic Chroma dataset...
Trying Query 0 for /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/configs/chroma/docling/batch_1/ESMO_Breast_Cancer.txt


2025-12-13 20:21:08,324 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Trying Query 0 for /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/configs/chroma/docling/batch_1/NCCN_breast_p1.txt


2025-12-13 20:21:40,151 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Trying Query 0 for /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/configs/chroma/docling/batch_1/NCCN_breast.txt


2025-12-13 20:21:50,552 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Trying Query 0 for /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/configs/chroma/docling/batch_1/Contouring_ESTRO_breast_and_LAW.txt


2025-12-13 20:22:28,136 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Success: Created synthetic evaluation!


In [7]:
# Run the synthetic evaluation. Only the first entry of `chunkers` is evaluated
# here; any additional configured chunkers are not part of this run.
synth_res = synth_eval.run(chunker=chunkers[0], embedding_function=embedding_function)

Chunking ESMO_Breast_Cancer using FIXED_SIZE...
Info: Directory already exists: /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/segmentation/fixed_size/docling/batch_1
Chunking NCCN_breast_p1 using FIXED_SIZE...
Info: Directory already exists: /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/segmentation/fixed_size/docling/batch_1
Chunking NCCN_breast using FIXED_SIZE...


Token indices sequence length is longer than the specified maximum sequence length for this model (628 > 512). Running this sequence through the model will result in indexing errors


Info: Directory already exists: /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/segmentation/fixed_size/docling/batch_1
Chunking Contouring_ESTRO_breast_and_LAW using FIXED_SIZE...
Info: Directory already exists: /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/segmentation/fixed_size/docling/batch_1


2025-12-13 20:22:38,889 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-13 20:22:41,959 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-13 20:22:45,649 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-13 20:22:49,548 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-13 20:22:51,790 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [8]:
# Drop the raw per-corpus scores so only the remaining summary fields are
# tabulated. The membership check keeps this cell re-runnable: a bare `del`
# raises KeyError on the second execution because the key is already gone.
if "corpora_scores" in synth_res:
    del synth_res["corpora_scores"]

In [9]:
# Show the synthetic-evaluation summary as a single-row table.
pd.DataFrame([synth_res])

Unnamed: 0,iou_mean,iou_std,recall_mean,recall_std,precision_omega_mean,precision_omega_std,precision_mean,precision_std
0,0.035489,0.02647,0.676473,0.428597,0.11777,0.042573,0.036114,0.027236
