In [None]:
import os

from huggingface_hub import snapshot_download

from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
from docling.document_converter import (
    ConversionResult,
    DocumentConverter,
    InputFormat,
    PdfFormatOption,
)

In [None]:
# RapidOCR models originate from the Hugging Face repo "SWHL/RapidOCR" (the download call in this cell is currently disabled)
# The Hugging Face download is disabled: a later cell builds the det/rec/cls
# model paths from a local `models` directory under the working directory.
# The message below reflects that, instead of announcing a download that
# never runs.
print("Using local RapidOCR models (Hugging Face download is disabled)")

# To fetch the models from the hub instead, uncomment the call below and build
# the three model paths from `download_path` (same "PP-OCRv4" / "PP-OCRv3"
# sub-folders and file names as the local layout).
# download_path = snapshot_download(repo_id="SWHL/RapidOCR")

In [None]:
# Anchor all model paths to the directory the kernel is currently running in.
ROOT_DIR = os.path.abspath(os.curdir)
# Bare expression: shows the resolved directory as the cell output.
ROOT_DIR

In [None]:
# Resolve the three RapidOCR ONNX model files stored under <ROOT_DIR>/models.
# Each entry is (sub-folder, file name); the unpacking order is det, rec, cls.
det_model_path, rec_model_path, cls_model_path = (
    os.path.join(ROOT_DIR, "models", model_dir, model_file)
    for model_dir, model_file in (
        ("PP-OCRv4", "en_PP-OCRv3_det_infer.onnx"),
        ("PP-OCRv4", "ch_PP-OCRv4_rec_server_infer.onnx"),
        ("PP-OCRv3", "ch_ppocr_mobile_v2.0_cls_train.onnx"),
    )
)

In [None]:
# Bare expression: shows the resolved `cls` model path as the cell output.
cls_model_path

In [None]:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
ImageFormatOption
)
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
)

In [None]:
# Uncomment to inspect the ImageFormatOption signature and docstring:
# help(ImageFormatOption)

In [None]:
# Point RapidOCR at the three model files whose paths were built earlier.
ocr_options = RapidOcrOptions(
    det_model_path=det_model_path,
    rec_model_path=rec_model_path,
    cls_model_path=cls_model_path,
)

# Run on the CPU with 8 threads.
accelerator_options = AcceleratorOptions(device=AcceleratorDevice.CPU, num_threads=8)

# Pipeline configuration: OCR enabled with the RapidOCR options above and
# table-structure recognition enabled.
pipeline_options = PdfPipelineOptions(
    accelerator_options=accelerator_options,
    do_ocr=True,
    do_table_structure=True,
    ocr_options=ocr_options,
)
# Cell matching is an attribute of the nested table-structure options, so it
# is switched on after the pipeline options object exists.
pipeline_options.table_structure_options.do_cell_matching = True

In [None]:
# Only the IMAGE input format gets a custom format option here; it carries the
# pipeline options configured for OCR.
image_format_option = ImageFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(
    format_options={InputFormat.IMAGE: image_format_option},
)

In [None]:
from docling.datamodel.settings import settings
# Turn on docling's pipeline profiling; the recorded timings are read back
# later from `conversion_result.timings`.
settings.debug.profile_pipeline_timings = True

In [None]:
# Image to run through the converter; the path is relative to the working directory.
source = "./images/OCR Test Document IMG.jpg"
# Convert the image; the result exposes the parsed document and the timings.
conversion_result = converter.convert(source)

In [None]:
# Keep a handle on the document object held by the conversion result.
doc = conversion_result.document


In [None]:
# Durations recorded under the "pipeline_total" timing key (a list of times).
doc_conversion_secs = conversion_result.timings["pipeline_total"].times

# Text export of the converted document.
md = doc.export_to_text()

# Emit the text first, then the timing line, each on its own line.
print(md, f"Conversion secs: {doc_conversion_secs}", sep="\n")