In [1]:
%load_ext autoreload

from pandas import DataFrame

from lib.parsing.methods.implementations.document_ai import DocumentAIParser
from lib.parsing.methods.implementations.gemini import GeminiParser
from lib.parsing.methods.implementations.unstructured import UnstructuredParser
from lib.parsing.methods.implementations.llamaparse import LlamaParseParser
from lib.parsing.methods.implementations.mineru import MinerUParser
from lib.parsing.methods.implementations.docling import DoclingParser

from lib.utils.export_to_latex import export_table_to_latex
from lib.evaluation.parsing.publaynet import (
    create_publaynet_gt,
    evaluate_parser,
    get_class_metrics
)

2026-01-30 15:42:52,963 - DEBUG : Successfully initialized Logging.


In [2]:
# Accumulators filled by p_eval: each call appends one entry to both lists,
# so entries stay aligned by position and follow the evaluation order.
f_50_per_class = list()
f_50_95_per_class = list()


def p_eval(parser):
    """Evaluate one parser and record its per-class scores.

    Runs ``evaluate_parser`` on ``parser``, hands the result (together with
    the parser) to ``get_class_metrics`` and appends the two returned records
    to the module-level lists ``f_50_per_class`` and ``f_50_95_per_class``.

    Args:
        parser: Parser instance to evaluate.

    Returns:
        None. The scores are stored in the accumulator lists as a side effect.
    """
    scores_50, scores_50_95 = get_class_metrics(evaluate_parser(parser), parser)
    # Both lists grow together so the same position refers to the same parser.
    f_50_per_class.append(scores_50)
    f_50_95_per_class.append(scores_50_95)


# Prepare the PubLayNet ground truth before any parser is scored.
# NOTE(review): -1 presumably means "no sample limit" — confirm against
# create_publaynet_gt.
create_publaynet_gt(-1, exist_ok=True)

# Parser classes paired with their constructor keyword arguments, listed in
# the order in which they are evaluated (and thus appear in the result lists).
parser_specs = (
    (UnstructuredParser, {}),
    (DoclingParser, {"use_vlm": False}),
    (DoclingParser, {"use_vlm": True}),
    (MinerUParser, {"use_vlm": False}),
    (MinerUParser, {"use_vlm": True}),
    (LlamaParseParser, {}),
    (DocumentAIParser, {}),
    (GeminiParser, {}),
)

# Each parser is constructed only when its turn comes and is passed straight
# to p_eval, so no more than one parser instance is created ahead of use.
for parser_cls, parser_kwargs in parser_specs:
    p_eval(parser_cls(**parser_kwargs))

# One row per evaluated parser, built from the lists that p_eval filled.
df_50 = DataFrame(f_50_per_class)
df_50_95 = DataFrame(f_50_95_per_class)

2026-01-30 15:43:02,529 - INFO : Started generating new PubLayNet ground truth file.
2026-01-30 15:43:02,529 - DEBUG : Downloading dataset from kenza-ily/publaynet-mini...
2026-01-30 15:43:04,872 - DEBUG : Processing PubLayNet dataset...
100%|██████████| 500/500 [00:04<00:00, 123.93it/s]
2026-01-30 15:43:08,938 - INFO : Saved ground truth file to /Users/matteo/Uni/7.Semester/thesis/visual-chunking/data/configs/publaynet/gt.json
2026-01-30 15:43:08,940 - INFO : Evaluating unstructured_io on PubLayNet...
2026-01-30 15:43:08,940 - DEBUG : Skipping Document: 92917. Output JSON already exists.
2026-01-30 15:43:08,941 - DEBUG : Skipping Document: 42713. Output JSON already exists.
2026-01-30 15:43:08,942 - DEBUG : Skipping Document: 260556. Output JSON already exists.
2026-01-30 15:43:08,942 - DEBUG : Skipping Document: 108646. Output JSON already exists.
2026-01-30 15:43:08,943 - DEBUG : Skipping Document: 325220. Output JSON already exists.
2026-01-30 15:43:08,943 - DEBUG : Skipping Docume

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2026-01-30 15:43:12.676[0m | [1mINFO    [0m | [36mmineru.backend.pipeline.model_init[0m:[36m__init__[0m:[36m271[0m - [1mDocAnalysis init done![0m
[32m2026-01-30 15:43:12.676[0m | [1mINFO    [0m | [36mmineru.backend.pipeline.pipeline_analyze[0m:[36mcustom_model_init[0m:[36m65[0m - [1mmodel init cost: 1.861520767211914[0m
2026-01-30 15:43:12,677 - INFO : Evaluating mineru_pipeline on PubLayNet...
2026-01-30 15:43:12,677 - DEBUG : Skipping Document: 92917. Output JSON already exists.
2026-01-30 15:43:12,678 - DEBUG : Skipping Document: 42713. Output JSON already exists.
2026-01-30 15:43:12,679 - DEBUG : Skipping Document: 260556. Output JSON already exists.
2026-01-30 15:43:12,680 - DEBUG : Skipping Document: 108646. Output JSON already exists.
2026-01-30 15:43:12,681 - DEBUG : Skipping Document: 325220. Output JSON already exists.
2026-01-30 15:43:12,682 - DEBUG : Skipping Document: 161545. Output JSON already exists.
2026-01-30 15:43:12,684 - DEBUG : Skipping

Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
[32m2026-01-30 15:43:15.824[0m | [1mINFO    [0m | [36mmineru.backend.vlm.vlm_analyze[0m:[36mget_model[0m:[36m189[0m - [1mget mlx-engine predictor cost: 1.58s[0m
2026-01-30 15:43:15,824 - INFO : Evaluating mineru_vlm on PubLayNet...
2026-01-30 15:43:15,825 - DEBUG : Skipping Document: 92917. Output JSON already exists.
2026-01-30 15:43:15,826 - DEBUG : Skipping Document: 42713. Output JSON already exists.
2026-01-30 15:43:15,827 - DEBUG : Skipping Document: 260556. Output JSON already exists.
2026-01-30 15:43:15,827 - DEBUG : Skipping Document: 108646. Output JSON already exists.
2026-01-30 1

In [3]:
# Display the table built from f_50_per_class (one row per evaluated parser).
# NOTE(review): "f_50" presumably denotes an F-score at IoU 0.50 — confirm
# against get_class_metrics.
df_50

Unnamed: 0,text,title,list,table,figure,all
unstructured_io,0.812313,0.803179,0.126923,0.933735,0.613848,0.621584
docling,0.8687,0.877524,0.80219,0.953027,0.595071,0.80631
docling_granite,0.673736,0.630929,0.632928,0.90012,0.181113,0.529565
mineru_pipeline,0.873526,0.95582,0.477108,0.978369,0.653447,0.652773
mineru_vlm,0.911902,0.882168,0.570194,0.979625,0.250781,0.594106
llamaparse,0.771083,0.637026,0.0,0.683096,0.0,0.410951
document_ai,0.5823,0.978864,0.0,0.991093,0.0,0.504302
gemini,0.824157,0.76186,0.127059,0.872475,0.653006,0.615318


In [4]:
# Display the table built from f_50_95_per_class (one row per evaluated parser).
# NOTE(review): "f_50_95" presumably denotes an F-score averaged over IoU
# 0.50:0.95 — confirm against get_class_metrics.
df_50_95

Unnamed: 0,text,title,list,table,figure,all
unstructured_io,0.75833,0.602866,0.068179,0.870709,0.498703,0.509517
docling,0.820642,0.631126,0.740605,0.914278,0.495166,0.665003
docling_granite,0.624125,0.430189,0.569447,0.84968,0.160787,0.43817
mineru_pipeline,0.809724,0.616971,0.433118,0.940654,0.543604,0.534162
mineru_vlm,0.803198,0.446107,0.490569,0.940944,0.203374,0.433817
llamaparse,0.723951,0.305651,0.0,0.659364,0.0,0.307255
document_ai,0.530415,0.659282,0.0,0.966871,0.0,0.392779
gemini,0.73472,0.500158,0.075997,0.763644,0.582187,0.488951


In [5]:
# Write both result tables out via export_table_to_latex, pairing each frame
# with the name it is exported under.
for table, table_name in (
    (df_50, "f_50_publaynet"),
    (df_50_95, "f_50_95_publaynet"),
):
    export_table_to_latex(table, table_name)

2026-01-30 15:43:20,809 - INFO : Saved table content to: /Users/matteo/Uni/7.Semester/thesis/visual-chunking/thesis/figures/tables/f_50_publaynet.tex
2026-01-30 15:43:20,815 - INFO : Saved table content to: /Users/matteo/Uni/7.Semester/thesis/visual-chunking/thesis/figures/tables/f_50_95_publaynet.tex
