# Parsing Evaluation with Chroma

In [19]:
%load_ext autoreload
%autoreload 2
from chunking_evaluation import GeneralEvaluation
from dotenv import load_dotenv
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

import pandas as pd
import numpy as np

from lib.evaluation.segmentation.chroma.chroma_adapter import ChromaChunker
from lib.evaluation.segmentation.chroma.chroma_setup import setup_synthetic_evaluation
from lib.parsing.methods.parsers import Parsers
from lib.segmentation.methods.implementations.fixed_sized import FixedSizeChunker
from lib.segmentation.methods.implementations.recursive import RecursiveChunker

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
# (max_tokens, overlap) settings to compare; every setting is evaluated with
# both chunker implementations, fixed-size first and recursive second.
chunker_settings = [
    (800, 400),
    (400, 200),
    (400, 0),
    (200, 0),
]

chunkers = [
    chunker_cls(max_tokens=max_tokens, overlap=overlap)
    for max_tokens, overlap in chunker_settings
    for chunker_cls in (FixedSizeChunker, RecursiveChunker)
]

Error in callback <bound method AutoreloadMagics.post_execute_hook of <IPython.extensions.autoreload.AutoreloadMagics object at 0x1101e1fd0>> (for post_execute), with arguments args (),kwargs {}:


KeyboardInterrupt: 

In [14]:
# Wrap every chunker in the ChromaChunker adapter; the evaluation cells below
# pass these wrapped objects to the evaluation runners and call get_info() on them.
#
# The isinstance guard keeps this cell idempotent: it rebinds `chunkers` from
# itself, so without the guard a second run would wrap the adapters again
# (ChromaChunker(ChromaChunker(...))).
chunkers = [
    c if isinstance(c, ChromaChunker) else ChromaChunker(c)
    for c in chunkers
]

In [15]:
# Requires "OPENAI_API_KEY" to be defined in a .env file; load_dotenv() reads
# that file before the embedding function is constructed.
EMBEDDING_MODEL_NAME = "text-embedding-3-small"

load_dotenv()
embedding_function = OpenAIEmbeddingFunction(model_name=EMBEDDING_MODEL_NAME)

## General Evaluation (predefined data sets)

In [16]:
evaluation = GeneralEvaluation()

# Both lists collect one dict per chunker, in the order of `chunkers`.
results = []         # all-corpus mean scores + chunker info
pubmed_results = []  # PubMed-only mean/std per metric + chunker info

for chunker in chunkers:
    method_info = chunker.get_info()
    scores = evaluation.run(chunker, embedding_function)

    # Detach the per-corpus breakdown and keep only its PubMed entry; what is
    # left in `scores` are the all-corpus summary values.
    pubmed_scores = scores.pop("corpora_scores")["pubmed"]

    # All-corpus summary row, tagged with the chunker's info fields.
    scores.update(method_info)
    results.append(scores)

    # PubMed-only row: collapse each metric's score list to mean and std.
    # Keys look like "<metric>_scores"; the suffix is stripped for the column names.
    pubmed_row = {}
    for metric_key, metric_values in pubmed_scores.items():
        prefix = metric_key.replace("_scores", "")
        pubmed_row.update(
            {
                f"{prefix}_mean": np.mean(metric_values),
                f"{prefix}_std": np.std(metric_values),
            }
        )

    pubmed_row.update(method_info)
    pubmed_results.append(pubmed_row)

2025-12-15 19:02:13,685 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-15 19:02:15,835 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
2025-12-15 19:02:15,838 - INFO - Retrying request to /embeddings in 4.923000 seconds
2025-12-15 19:02:23,719 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-15 19:02:25,253 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
2025-12-15 19:02:25,256 - INFO - Retrying request to /embeddings in 5.505000 seconds
2025-12-15 19:02:32,935 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-15 19:02:35,569 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
2025-12-15 19:02:35,571 - INFO - Retrying request to /embeddings in 6.867000 seconds
2025-12-15 19:02:44,712 - INFO - HTTP Request: POST https:

In [17]:
# One row per chunker: the evaluation result with "corpora_scores" removed,
# extended with the fields returned by the chunker's get_info().
pd.DataFrame(results)

Unnamed: 0,iou_mean,iou_std,recall_mean,recall_std,precision_omega_mean,precision_omega_std,precision_mean,precision_std,method,max_tokens,overlap
0,0.013484,0.011374,0.879179,0.316866,0.045663,0.030262,0.013488,0.011375,FixedSizeChunker,800,400
1,0.016023,0.014594,0.856036,0.346719,0.068034,0.053831,0.016026,0.014596,RecursiveChunker,800,400
2,0.02795,0.021816,0.9189,0.256813,0.084153,0.052076,0.027974,0.021823,FixedSizeChunker,400,200
3,0.031794,0.02844,0.865487,0.334225,0.126718,0.087666,0.031812,0.028453,RecursiveChunker,400,200
4,0.026476,0.022138,0.862473,0.326714,0.120204,0.078366,0.026504,0.022146,FixedSizeChunker,400,0
5,0.032294,0.02793,0.871127,0.326619,0.147479,0.100532,0.032322,0.027967,RecursiveChunker,400,0
6,0.051228,0.041949,0.863683,0.308166,0.209428,0.122358,0.051415,0.042049,FixedSizeChunker,200,0
7,0.063458,0.051763,0.866807,0.318831,0.260989,0.155548,0.063774,0.052115,RecursiveChunker,200,0


In [18]:
# One row per chunker: mean and standard deviation of every PubMed metric,
# extended with the fields returned by the chunker's get_info().
pd.DataFrame(pubmed_results)

Unnamed: 0,precision_omega_mean,precision_omega_std,iou_mean,iou_std,recall_mean,recall_std,precision_mean,precision_std,method,max_tokens,overlap
0,0.060083,0.034847,0.018083,0.013557,0.845734,0.343335,0.018092,0.013557,FixedSizeChunker,800,400
1,0.081863,0.058511,0.022484,0.018533,0.819775,0.374184,0.022491,0.018531,RecursiveChunker,800,400
2,0.107507,0.058727,0.035788,0.025321,0.845141,0.334229,0.035877,0.025315,FixedSizeChunker,400,200
3,0.163856,0.1077,0.042158,0.040043,0.747848,0.420396,0.042213,0.040062,RecursiveChunker,400,200
4,0.154159,0.095123,0.032689,0.026631,0.770264,0.393299,0.032753,0.026622,FixedSizeChunker,400,0
5,0.196211,0.123951,0.042903,0.036476,0.784855,0.397697,0.042998,0.036575,RecursiveChunker,400,0
6,0.252038,0.134686,0.056718,0.047214,0.712142,0.39631,0.057334,0.047538,FixedSizeChunker,200,0
7,0.31629,0.175872,0.080664,0.062101,0.775903,0.373964,0.081804,0.063,RecursiveChunker,200,0


## Synthetic Evaluation

In [5]:
# Build the synthetic evaluation for the Docling parser on the "batch_1" document set.
# NOTE(review): query_exist_ok=True presumably allows already-generated queries to be
# present/reused instead of failing — confirm against chroma_setup.setup_synthetic_evaluation.
synth_eval = setup_synthetic_evaluation(Parsers.DOCLING, "batch_1", query_exist_ok=True)

4it [00:00, 90.17it/s]
2025-12-13 20:20:42,416 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Skipping Document: ESMO_Breast_Cancer. Output JSON already exists.
Skipping Document: NCCN_breast_p1. Output JSON already exists.
Skipping Document: NCCN_breast. Output JSON already exists.
Skipping Document: Contouring_ESTRO_breast_and_LAW. Output JSON already exists.
Info: Scanning /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/configs/chroma/docling/batch_1 for text documents...
Info: Using 4 documents to create synthetic Chroma dataset...
Trying Query 0 for /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/configs/chroma/docling/batch_1/ESMO_Breast_Cancer.txt


2025-12-13 20:21:08,324 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Trying Query 0 for /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/configs/chroma/docling/batch_1/NCCN_breast_p1.txt


2025-12-13 20:21:40,151 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Trying Query 0 for /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/configs/chroma/docling/batch_1/NCCN_breast.txt


2025-12-13 20:21:50,552 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Trying Query 0 for /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/configs/chroma/docling/batch_1/Contouring_ESTRO_breast_and_LAW.txt


2025-12-13 20:22:28,136 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Success: Created synthetic evaluation!


In [7]:
# Run the synthetic evaluation for a single chunker only: the first entry of
# `chunkers` (the other configurations are not evaluated in this cell).
synth_res = synth_eval.run(chunker=chunkers[0], embedding_function=embedding_function)

Chunking ESMO_Breast_Cancer using FIXED_SIZE...
Info: Directory already exists: /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/segmentation/fixed_size/docling/batch_1
Chunking NCCN_breast_p1 using FIXED_SIZE...
Info: Directory already exists: /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/segmentation/fixed_size/docling/batch_1
Chunking NCCN_breast using FIXED_SIZE...


Token indices sequence length is longer than the specified maximum sequence length for this model (628 > 512). Running this sequence through the model will result in indexing errors


Info: Directory already exists: /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/segmentation/fixed_size/docling/batch_1
Chunking Contouring_ESTRO_breast_and_LAW using FIXED_SIZE...
Info: Directory already exists: /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/segmentation/fixed_size/docling/batch_1


2025-12-13 20:22:38,889 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-13 20:22:41,959 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-13 20:22:45,649 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-13 20:22:49,548 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-13 20:22:51,790 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [8]:
# Drop the per-corpus breakdown so that only the summary values remain for the
# table built from synth_res.
# pop() with a default replaces `del` so this cell can be re-run safely: a second
# `del` on the same dict raises KeyError. The trailing semicolon stops the
# notebook from echoing the popped value as the cell's output.
synth_res.pop("corpora_scores", None);

In [9]:
# Single-row table of the synthetic evaluation result; synth_res is wrapped in
# a list so that the dict becomes one row.
pd.DataFrame([synth_res])

Unnamed: 0,iou_mean,iou_std,recall_mean,recall_std,precision_omega_mean,precision_omega_std,precision_mean,precision_std
0,0.035489,0.02647,0.676473,0.428597,0.11777,0.042573,0.036114,0.027236
