In [1]:
import json
import logging
import time
from pathlib import Path

In [2]:
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.ocr_mac_model import OcrMacOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docling.models.tesseract_ocr_model import TesseractOcrOptions
from docling.models.easyocr_model import  EasyOcrOptions
from docling.datamodel.settings import PageRange

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Logger for this notebook's own messages. It is named after the running
# module, which shows up as "__main__" in the recorded log output.
_log = logging.getLogger(__name__)

In [4]:

# Show INFO-level records so that docling's progress messages and the timing
# line logged after conversion appear in the cell output.
logging.basicConfig(level=logging.INFO)

# PDF to convert. Built from the current user's home directory rather than a
# hard-coded "/home/<user>" prefix, so the notebook is not tied to one account.
input_doc_path = Path.home() / "Downloads" / "ijsst07.pdf"

###########################################################################

# The following sections contain a combination of PipelineOptions
# and PDF Backends for various configurations.
# Uncomment one section at a time to see the differences in the output.

# PyPdfium without EasyOCR
# --------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = False
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = False

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(
#             pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
#         )
#     }
# )

# PyPdfium with EasyOCR
# -----------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(
#             pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
#         )
#     }
# )

# Docling Parse without EasyOCR
# -------------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = False
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
#     }
# )

# Docling Parse with EasyOCR
# ----------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options.lang = ["es"]
# pipeline_options.accelerator_options = AcceleratorOptions(
#     num_threads=4, device=AcceleratorDevice.AUTO
# )
# pipeline_options.artifacts_path = Path("/home/mugesh/.cache/docling/models")

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
#     }
# )

# Docling Parse with EasyOCR (CPU only)
# ----------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.ocr_options.use_gpu = False  # <-- set this.
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
#     }
# )

# Docling Parse with EasyOCR (explicit OCR options, explicit artifacts path)
# ----------------------
# NOTE: this section used to be labelled "Tesseract", but the OCR options it
# installs are EasyOcrOptions, so the label now matches what actually runs.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options = EasyOcrOptions()
# Directory handed to docling as its artifacts path. Derived from the current
# user's home directory instead of a hard-coded "/home/<user>" prefix.
pipeline_options.artifacts_path = Path.home() / ".cache" / "docling" / "models"

# AUTO leaves the device choice to docling; the recorded run reports 'cuda:0'.
pipeline_options.accelerator_options = AcceleratorOptions(
    num_threads=4, device=AcceleratorDevice.AUTO
)

# No `backend=` is passed here, unlike the PyPdfium sections above, so the
# PDF format option's default backend is used.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

# Docling Parse with Tesseract CLI
# ----------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesseractCliOcrOptions()

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
#     }
# )

# Docling Parse with ocrmac (Mac only)
# ----------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = OcrMacOptions()

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
#     }
# )


In [5]:
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter

In [6]:
 # Turn on inline debug visualizations:
# Switch on every debug visualisation flag (layout, OCR, tables, cells), in
# the same order as they were previously assigned one by one.
debug_settings = settings.debug
for visual_flag in (
    "visualize_layout",
    "visualize_ocr",
    "visualize_tables",
    "visualize_cells",
):
    setattr(debug_settings, visual_flag, True)

# Turn on pipeline timing profiling; later cells read `conv_result.timings`.
debug_settings.profile_pipeline_timings = True

In [7]:

# Time the conversion with a monotonic, high-resolution clock: unlike
# time.time(), perf_counter() cannot jump if the system clock is adjusted
# while the document is being converted.
start_time = time.perf_counter()
conv_result = doc_converter.convert(input_doc_path)
# `end_time` keeps its historical name but holds the elapsed seconds.
end_time = time.perf_counter() - start_time

# Lazy %-style arguments: the message is only formatted if the record is emitted.
_log.info("Document converted in %.2f seconds.", end_time)


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.pipeline.base_pipeline:Processing document ijsst07.pdf
INFO:docling.document_converter:Finished converting document ijsst07.pdf in 19.60 sec.
INFO:__main__:Document converted in 19.60 seconds.


In [8]:
 # List with total time per document
# Look up the "pipeline_total" profiling entry first, then take its list of
# recorded durations; the bare name on the last line displays it.
pipeline_total_timing = conv_result.timings["pipeline_total"]
doc_conversion_secs = pipeline_total_timing.times
doc_conversion_secs

[15.857291930999054]

In [9]:
# Display the list of converted pages. Each Page repr includes its page_no,
# size, and text cells with bounding boxes.
# NOTE(review): this prints the full repr of every page, which is very large;
# consider slicing (e.g. the first page only) before sharing the notebook.
conv_result.pages

[Page(page_no=0, size=Size(width=595.0, height=842.0), cells=[Cell(id=0, text='10                     ISSN 1473-804x online, 1473-8031 print', bbox=BoundingBox(l=286.8, t=794.79, r=520.2, b=808.092, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>)), Cell(id=1, text='E. CASALICCHIO, R.LANCELLOTTI, M.E. POLEGGI: SIMULATION FRAMEWORK', bbox=BoundingBox(l=115.2, t=35.989999999999895, r=480.17999999999995, b=47.05999999999983, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>)), Cell(id=2, text='A SIMULATION FRAMEWORK FOR CLUSTER-BASED WEB ', bbox=BoundingBox(l=72.4, t=70.94399999999996, r=527.1, b=88.65599999999995, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>)), Cell(id=3, text='SERVICES', bbox=BoundingBox(l=258.0, t=88.644, r=337.48, b=106.356, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>)), Cell(id=4, text='EMILIANO CASALICCHIO', bbox=BoundingBox(l=226.3, t=117.4079999999999, r=369.188, b=130.692, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>)), Cell(id=5, text='Dipartimento di Informatica,

In [10]:
# Walk the converted pages in order: print the page number, then the text of
# every cell found on that page, one cell per line.
for page in conv_result.pages:
    print(page.page_no)
    page_cells = page.cells
    for cell in page_cells:
        print(cell.text)

0
10                     ISSN 1473-804x online, 1473-8031 print
E. CASALICCHIO, R.LANCELLOTTI, M.E. POLEGGI: SIMULATION FRAMEWORK
A SIMULATION FRAMEWORK FOR CLUSTER-BASED WEB 
SERVICES
EMILIANO CASALICCHIO
Dipartimento di Informatica, Sistemi e Produzione
Università di Roma 'Tor Vergata'
E-mail: casalicchio@uniroma2.it
RICCARDO LANCELLOTTI
Dipartimento di Iingegneria dell'Informazione
Università di Modena e Reggio Emilia
E-mail:riccardo.lancellotti@unimore.it
MARCO EMILIO POLEGGI
CERN-IT/INFN-CNAF
E-mail: Marco.Emilio.Poleggi@cern.ch
Abstract: 
We propose a simulation framework, namely CWebSim, specifically designed for the performance 
evaluation and capacity planning of cluster-based Web services. A broad variety of Web cluster configurations 
can be simulated through CWebSim. Its modularity permits the definition of different mechanisms, algorithms, 
network topologies and hardware resources. Also, two workload input alternatives are possible: a trace-driven 
mode   and   a   distri

In [11]:
# Show the last converted page. It is taken straight from `conv_result`
# rather than from a loop variable left behind by an earlier cell, so this
# cell works regardless of which other cells have been run.
conv_result.pages[-1]

Page(page_no=12, size=Size(width=595.0, height=842.0), cells=[Cell(id=0, text='22                     ISSN 1473-804x online, 1473-8031 print', bbox=BoundingBox(l=286.8, t=794.79, r=520.2, b=808.092, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>)), Cell(id=1, text='E. CASALICCHIO, R.LANCELLOTTI, M.E. POLEGGI: SIMULATION FRAMEWORK', bbox=BoundingBox(l=115.2, t=35.989999999999895, r=480.17999999999995, b=47.05999999999983, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>)), Cell(id=2, text='He acts as reviewer for international conferences, ', bbox=BoundingBox(l=71.0, t=70.88999999999999, r=286.1, b=81.96000000000004, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>)), Cell(id=3, text='and   journals:   IEEE   Transaction   on   Software ', bbox=BoundingBox(l=71.0, t=81.9899999999999, r=286.1, b=93.05999999999983, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>)), Cell(id=4, text='Engineering   (2007),   Computer   Communication ', bbox=BoundingBox(l=71.0, t=93.09000000000003, r=286.1, b=104.1599

In [12]:
# One line per profiled stage: the stage key followed by its recorded times.
for stage_name, stage_timing in conv_result.timings.items():
    print(stage_name, stage_timing.times)

pipeline_total [15.857291930999054]
doc_build [15.789905132998683]
page_init [0.15833303700128454, 0.2960027010012709, 0.45348903899866855, 0.33721452499958104, 0.3169649070005107, 0.37713186999826576, 0.3145197609992465, 0.2929822490004881, 0.0667410980022396, 0.21317528399958974, 0.39888344499922823, 0.2877678750010091, 0.08243680299710832]
page_parse [0.07082551399798831, 0.06348976199660683, 0.0655707789992448, 0.06304189799993765, 0.06528262799838558, 0.06655179000154021, 0.06435200600026292, 0.06404953600213048, 0.054617703000985784, 0.05803527899843175, 0.06426855800236808, 0.0575259110009938, 0.040817245997459395]
ocr [0.5422754720020748, 0.5497732699986955, 0.14030805499714916, 0.3736414249979134, 0.6924525350004842, 0.5086236480019579, 0.5055616760000703, 0.8703530349994253, 1.641437483001937, 0.7890552690005279, 0.13994437299697893, 0.14122671000222908, 0.13747321800110512]
layout [0.26373823000176344, 0.12021008699957747, 0.11462922599821468, 0.12354107700230088, 0.12586055

In [None]:
# Print a per-stage profiling summary from `conv_result.timings`.
# Stages with several recorded values (page-wise stages in the recorded run)
# get average/min/max; stages with a single value are printed on one line.
print("\nDetailed Profiling Summary:")
for stage, timing in conv_result.timings.items():
    times = timing.times  # Expected to be a list of timing values.
    if not isinstance(times, list):
        # Fallback if the timing is a single number rather than a list.
        print(f"Stage: {stage} -- Time: {times:.4f} sec")
    elif not times:
        # An empty list previously crashed on times[0]; report it instead.
        print(f"Stage: {stage} -- no timings recorded")
    elif len(times) == 1:
        # Only one timing value present.
        print(f"Stage: {stage} -- Time: {times[0]:.4f} sec")
    else:
        # Summary statistics for stages with multiple timings.
        avg_time = sum(times) / len(times)
        min_time = min(times)
        max_time = max(times)
        print(f"Stage: {stage}")
        print(f"  Page-wise timings: {times}")
        print(f"  Average: {avg_time:.4f} sec, Min: {min_time:.4f} sec, Max: {max_time:.4f} sec")


Detailed Profiling Summary:
Stage: pipeline_total -- Time: 15.8294 sec
Stage: doc_build -- Time: 15.7532 sec
Stage: page_init
  Page-wise timings: [0.1662265339982696, 0.3463448349975806, 0.4710821059998125, 0.32157275400095386, 0.31095493999964674, 0.40180187299847603, 0.3110117309988709, 0.2979890719980176, 0.07001416899947799, 0.20916492399919662, 0.39909250599885127, 0.29217016299662646, 0.08404280100148753]
  Average: 0.2832 sec, Min: 0.0700 sec, Max: 0.4711 sec
Stage: page_parse
  Page-wise timings: [0.06609504399966681, 0.06357274300171412, 0.06758854799772962, 0.0637470689980546, 0.06738299900098355, 0.0669565349999175, 0.06353413900069427, 0.06816727200202877, 0.05496784700153512, 0.05921818400020129, 0.06470415799776674, 0.05893914299667813, 0.04297425700133317]
  Average: 0.0621 sec, Min: 0.0430 sec, Max: 0.0682 sec
Stage: ocr
  Page-wise timings: [0.38407244400150375, 0.5271335200013709, 0.14119525800197152, 0.3685695089989167, 0.6824890700008837, 0.6712431289997767, 0.517

In [13]:
# Display the assembled elements of the conversion result. In the recorded
# output these are labelled elements (e.g. page_footer, page_header), each
# with its cluster, confidence and source cells.
# NOTE(review): like `conv_result.pages`, this prints a very large repr.
conv_result.assembled.elements

[TextElement(label=<DocItemLabel.PAGE_FOOTER: 'page_footer'>, id=6, page_no=0, cluster=Cluster(id=6, label=<DocItemLabel.PAGE_FOOTER: 'page_footer'>, bbox=BoundingBox(l=286.8, t=794.79, r=520.2, b=808.092, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>), confidence=0.8682352900505066, cells=[Cell(id=0, text='10                     ISSN 1473-804x online, 1473-8031 print', bbox=BoundingBox(l=286.8, t=794.79, r=520.2, b=808.092, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>))], children=[]), text='10                     ISSN 1473-804x online, 1473-8031 print'),
 TextElement(label=<DocItemLabel.PAGE_HEADER: 'page_header'>, id=18, page_no=0, cluster=Cluster(id=18, label=<DocItemLabel.PAGE_HEADER: 'page_header'>, bbox=BoundingBox(l=115.2, t=35.989999999999895, r=480.17999999999995, b=47.05999999999983, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>), confidence=0.7155863642692566, cells=[Cell(id=1, text='E. CASALICCHIO, R.LANCELLOTTI, M.E. POLEGGI: SIMULATION FRAMEWORK', bbox=BoundingBox(l=115

In [14]:
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker

In [15]:

chunker = HybridChunker()
# chunk() hands back a one-shot iterator, so a second pass over it would
# yield nothing. Materialise the chunks once so any cell that loops over
# `chunk_iter` can be re-run. The name is kept for compatibility, although
# it now refers to a list.
chunk_iter = list(chunker.chunk(dl_doc=conv_result.document))

Token indices sequence length is longer than the specified maximum sequence length for this model (543 > 512). Running this sequence through the model will result in indexing errors


In [16]:
# Print each chunk's text, then the page number of every document item the
# chunk was built from, then a separator line.
# The page numbers printed here start at 1 in the recorded output, whereas
# `Page.page_no` printed earlier starts at 0.
for chunk in chunk_iter:
    print(chunk.text)

    # Use the first provenance entry of each item. Items without any
    # provenance are reported as None instead of raising IndexError.
    print(
        [
            item.prov[0].page_no if item.prov else None
            for item in chunk.meta.doc_items
        ]
    )
    print("-" * 100)

Dipartimento di Informatica, Sistemi e Produzione
Università di Roma 'Tor Vergata'
E-mail: casalicchio@uniroma2.it
[1, 1, 1]
----------------------------------------------------------------------------------------------------
Dipartimento di Iingegneria dell'Informazione
Università di Modena e Reggio Emilia
E-mail:riccardo.lancellotti@unimore.it
[1, 1, 1]
----------------------------------------------------------------------------------------------------
CERN-IT/INFN-CNAF
E-mail: Marco.Emilio.Poleggi@cern.ch
Abstract: We propose a simulation framework, namely CWebSim, specifically designed for the performance evaluation and capacity planning of cluster-based Web services. A broad variety of Web cluster configurations can be simulated through CWebSim. Its modularity permits the definition of different mechanisms, algorithms, network topologies and hardware resources. Also, two workload input alternatives are possible: a trace-driven mode   and   a   distribution-driven   mode   that   e

In [17]:
##########################################################################

## Export results
# All exports go to ./scratch and are named after the input file's stem.
output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = conv_result.input.file.stem

# Deep Search document JSON format.
json_path = output_dir / f"{doc_filename}.json"
with json_path.open("w", encoding="utf-8") as json_file:
    json_payload = json.dumps(conv_result.document.export_to_dict())
    json_file.write(json_payload)

# Plain text format.
text_path = output_dir / f"{doc_filename}.txt"
with text_path.open("w", encoding="utf-8") as text_file:
    text_payload = conv_result.document.export_to_text()
    text_file.write(text_payload)

# Markdown format.
markdown_path = output_dir / f"{doc_filename}.md"
with markdown_path.open("w", encoding="utf-8") as markdown_file:
    markdown_payload = conv_result.document.export_to_markdown()
    markdown_file.write(markdown_payload)

# Document Tags format.
doctags_path = output_dir / f"{doc_filename}.doctags"
with doctags_path.open("w", encoding="utf-8") as doctags_file:
    doctags_payload = conv_result.document.export_to_document_tokens()
    doctags_file.write(doctags_payload)