## Data Extraction with Docling

In this notebook, we'll extract content from PDFs into structured formats:

- **Markdown**: Full document text with page breaks for chunking
- **Images**: Save pages containing large charts/diagrams (>500x500 pixels)
- **Tables**: Extract each table with the 2 preceding non-blank lines as context, plus page number metadata

**Output Structure:**
```
data/rag-data/markdown/{company}/{document}.md
data/rag-data/images/{company}/{document}/page_5.png
data/rag-data/tables/{company}/{document}/table_1_page_5.md
```

Docling project: https://github.com/docling-project/docling

### Setup and Configuration

In [None]:
from pathlib import Path
from typing import List, Tuple

from docling_core.types.doc import PictureItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

In [None]:
# Directory paths (relative strings, resolved against the current working directory)
DATA_DIR = "data/rag-data/pdfs"  # input PDFs; walked recursively by the batch cell
OUTPUT_MD_DIR = "data/rag-data/markdown"  # full-document Markdown exports
OUTPUT_IMAGES_DIR = "data/rag-data/images"  # PNGs of pages that contain large pictures
OUTPUT_TABLES_DIR = "data/rag-data/tables"  # one Markdown file per extracted table

### Metadata Extraction

In [None]:
def extract_metadata_from_filename(filename: str) -> dict:
    """
    Extract metadata from filename.

    Expected format: CompanyName DocType [Quarter] Year.pdf
    Examples:
        - Amazon 10-Q Q1 2024.pdf
        - Microsoft 10-K 2023.pdf

    Args:
        filename: File name or stem. A single trailing ``.pdf`` or ``.md``
            extension (any letter case) is removed before parsing.

    Returns:
        Dict with the keys ``company_name``, ``doc_type``, ``fiscal_quarter``
        and ``fiscal_year``. ``fiscal_quarter`` is ``None`` unless the name
        splits into exactly four whitespace-separated parts. Values keep the
        letter case used in the filename.

    Raises:
        ValueError: If fewer than two whitespace-separated parts remain, so
            neither a company name nor a document type can be read.
    """
    stem = filename.strip()

    # Remove the extension only when it is a suffix. A plain str.replace would
    # also delete ".md"/".pdf" in the middle of a name and would miss ".PDF".
    lowered = stem.lower()
    for extension in ('.pdf', '.md'):
        if lowered.endswith(extension):
            stem = stem[:-len(extension)]
            break

    parts = stem.split()
    if len(parts) < 2:
        raise ValueError(
            f"Cannot parse metadata from {filename!r}: "
            "expected 'CompanyName DocType [Quarter] Year'"
        )

    return {
        'company_name': parts[0],
        'doc_type': parts[1],
        # The quarter is optional and only present in the four-part form.
        'fiscal_quarter': parts[2] if len(parts) == 4 else None,
        'fiscal_year': parts[-1]
    }

# Sanity check: a three-part name carries no quarter, so fiscal_quarter is None.
extract_metadata_from_filename('apple 10-k 2023.pdf')

{'company_name': 'apple',
 'doc_type': '10-k',
 'fiscal_quarter': None,
 'fiscal_year': '2023'}

### Extract Markdown, Images, and Tables

In [None]:
def convert_pdf_to_docling(pdf_file: Path):
    """
    Convert one PDF with Docling and return the conversion result.

    The pipeline is configured to render both individual picture images and
    full-page images, at an image scale of 2.

    Args:
        pdf_file: Path of the PDF to convert.

    Returns:
        Whatever ``DocumentConverter.convert`` returns for this file; other
        cells in this notebook read its ``.document`` attribute.
    """
    options = PdfPipelineOptions()
    options.generate_page_images = True
    options.generate_picture_images = True
    options.images_scale = 2

    # Only the PDF input format gets custom options.
    pdf_format = PdfFormatOption(pipeline_options=options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    return converter.convert(pdf_file)

In [None]:
def save_page_images(doc_converter, images_dir: Path, min_size: int = 500):
    """
    Find and save pages with large images (>500x500 pixels by default).

    Args:
        doc_converter: Object exposing a ``.document`` attribute (in this
            notebook, the value returned by ``convert_pdf_to_docling``).
        images_dir: Directory the PNG files are written to. It is not created
            here and must already exist.
        min_size: A picture counts as "large" only when both its width and its
            height are strictly greater than this many pixels.

    Each qualifying page is written once as ``page_<page_no>.png``.
    """
    document = doc_converter.document
    pages_to_save = set()

    for item in document.iterate_items():
        element = item[0]
        if not isinstance(element, PictureItem):
            continue

        image = element.get_image(document)
        # A picture may have no image data attached; skip it instead of
        # failing on `.size`.
        if image is None:
            continue

        width, height = image.size
        if width > min_size and height > min_size and element.prov:
            # The first provenance entry gives the page the picture sits on.
            page_no = element.prov[0].page_no
            if page_no:
                pages_to_save.add(page_no)

    # Save only after the scan has finished. Doing this inside the loop above
    # re-encodes and rewrites every collected page for each later item.
    for page_no in sorted(pages_to_save):
        page = document.pages[page_no]
        page.image.pil_image.save(images_dir / f"page_{page_no}.png", "PNG")


In [None]:
def extract_context_and_table(lines: List[str], table_index: int):
    """
    Collect one table plus the two lines that precede it.

    Args:
        lines: All markdown lines
        table_index: Index of the table's first row in ``lines``

    Returns:
        (combined_content, next_line_index), where ``combined_content`` is the
        context lines, a blank line, then the table rows, and
        ``next_line_index`` is the index of the first line after the table.
    """
    # Advance past the consecutive run of rows that begin with a pipe.
    end = table_index
    total = len(lines)
    while end < total and lines[end].startswith('|'):
        end += 1

    table_block = lines[table_index:end]

    # Up to two lines directly above the table serve as its context.
    context_block = lines[max(0, table_index - 2):table_index]

    combined = '\n'.join(context_block) + '\n\n' + '\n'.join(table_block)
    return combined, end
    

In [None]:
def extract_tables_with_context(markdown_text: str):
    """
    Scan markdown for tables and return each with its context and page number.

    Blank lines are dropped before scanning. The page counter starts at 1 and
    goes up by one for every line containing ``<!-- page break -->``. A table
    starts at a line that begins with ``|`` and contains more than one ``|``.

    Returns:
        List of (content, table_name, page_number), with tables named
        ``table_1``, ``table_2``, ... in document order.
    """
    rows = [row for row in markdown_text.split('\n') if row.strip()]

    found = []
    page = 1
    cursor = 0

    while cursor < len(rows):
        row = rows[cursor]

        # Page-break markers only advance the page counter.
        if '<!-- page break -->' in row:
            page += 1
            cursor += 1
            continue

        # Anything that is not the first row of a table is skipped.
        if not (row.startswith('|') and row.count('|') > 1):
            cursor += 1
            continue

        # The helper returns where the table ends, so scanning resumes there.
        content, cursor = extract_context_and_table(rows, cursor)
        found.append((content, f"table_{len(found) + 1}", page))

    return found
    

In [None]:
def save_tables(markdown_text, tables_dir):
    """
    Write every table found in ``markdown_text`` to its own Markdown file.

    Each file is named ``<table_name>_page_<page>.md`` inside ``tables_dir``
    and starts with a ``**Page:** <page>`` line, followed by a blank line and
    the table with its context.

    Args:
        markdown_text: Markdown of the whole document.
        tables_dir: Path-like directory object (must support ``/``) that the
            files are written into.
    """
    for body, name, page in extract_tables_with_context(markdown_text):
        target = tables_dir / f"{name}_page_{page}.md"
        target.write_text(f"**Page:** {page}\n\n{body}", encoding='utf-8')


In [None]:
def extract_pdf_content(pdf_file):
    """
    Convert one PDF and write its Markdown, page images and tables to disk.

    The company name parsed from the file stem selects the sub-folder under
    each output root. Images and tables additionally get a folder named after
    the file stem. Missing folders are created.

    Args:
        pdf_file: ``Path`` of the PDF to process.
    """
    company = extract_metadata_from_filename(pdf_file.stem)['company_name']

    md_dir = Path(OUTPUT_MD_DIR) / company
    images_dir = Path(OUTPUT_IMAGES_DIR) / company / pdf_file.stem
    tables_dir = Path(OUTPUT_TABLES_DIR) / company / pdf_file.stem
    for target in (md_dir, images_dir, tables_dir):
        target.mkdir(parents=True, exist_ok=True)

    result = convert_pdf_to_docling(pdf_file)

    # The placeholder written here is what the table extractor counts to
    # track page numbers.
    markdown_text = result.document.export_to_markdown(
        page_break_placeholder="<!-- page break -->"
    )
    markdown_path = md_dir / f"{pdf_file.stem}.md"
    markdown_path.write_text(markdown_text, encoding='utf-8')

    save_page_images(result, images_dir)
    save_tables(markdown_text, tables_dir)


In [None]:
# To debug a single document instead of the whole corpus, uncomment the two
# lines below (the example path uses Windows separators; adjust as needed).
# pdf_file = Path('data\\rag-data\\pdfs\\apple\\apple 8-k q4 2023.pdf')

# extract_pdf_content(pdf_file)

# Root folder that the batch cell walks for PDFs.
data_path = Path(DATA_DIR)


In [None]:
# Process every PDF below data_path. rglob gives no ordering guarantee, so the
# paths are sorted to make the processing order (and the logs) reproducible.
pdf_files = sorted(data_path.rglob("*.pdf"))
for pdf_file in pdf_files:
    print(pdf_file)
    extract_pdf_content(pdf_file)

2025-12-15 16:24:48,684 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:24:48,687 - INFO - Going to convert document batch...
2025-12-15 16:24:48,688 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:24:48,688 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:24:48,698 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:24:48,702 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:24:48,702 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:24:48,745 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:24:48,747 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\

data\rag-data\pdfs\amazon\amazon 10-k 2023.pdf


2025-12-15 16:24:50,225 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:24:50,715 - INFO - Processing document amazon 10-k 2023.pdf
2025-12-15 16:25:32,804 - INFO - Finished converting document amazon 10-k 2023.pdf in 44.12 sec.
2025-12-15 16:25:32,998 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:25:33,001 - INFO - Going to convert document batch...
2025-12-15 16:25:33,001 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:25:33,002 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:25:33,015 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:25:33,019 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:25:33,019 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4

data\rag-data\pdfs\amazon\amazon 10-k 2024.pdf


2025-12-15 16:25:34,220 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:25:34,529 - INFO - Processing document amazon 10-k 2024.pdf
2025-12-15 16:26:02,461 - INFO - Finished converting document amazon 10-k 2024.pdf in 29.46 sec.
2025-12-15 16:26:02,656 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:26:02,659 - INFO - Going to convert document batch...
2025-12-15 16:26:02,660 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:26:02,661 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:26:02,673 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:26:02,677 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:26:02,678 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4

data\rag-data\pdfs\amazon\amazon 10-q q1 2024.pdf


2025-12-15 16:26:02,917 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-15 16:26:02,918 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:26:03,830 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:26:04,128 - INFO - Processing document amazon 10-q q1 2024.pdf
2025-12-15 16:26:18,977 - INFO - Finished converting document amazon 10-q q1 2024.pdf in 16.32 sec.
2025-12-15 16:26:19,075 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:26:19,077 - INFO - Going to convert document batch...
2025-12-15 16:26:19,078 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:26:19,078 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:26:19,087 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:26:19,090 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

data\rag-data\pdfs\amazon\amazon 10-q q1 2025.pdf


2025-12-15 16:26:19,320 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:26:20,234 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:26:20,514 - INFO - Processing document amazon 10-q q1 2025.pdf
2025-12-15 16:26:34,721 - INFO - Finished converting document amazon 10-q q1 2025.pdf in 15.65 sec.
2025-12-15 16:26:34,819 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:26:34,821 - INFO - Going to convert document batch...
2025-12-15 16:26:34,822 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:26:34,822 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:26:34,831 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:26:34,834 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:26:34,835 [RapidOCR] main.py:53: Using C:\Users\l

data\rag-data\pdfs\amazon\amazon 10-q q2 2024.pdf


2025-12-15 16:26:35,025 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-15 16:26:35,026 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:26:36,105 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:26:36,386 - INFO - Processing document amazon 10-q q2 2024.pdf
2025-12-15 16:26:53,034 - INFO - Finished converting document amazon 10-q q2 2024.pdf in 18.22 sec.
2025-12-15 16:26:53,147 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:26:53,150 - INFO - Going to convert document batch...
2025-12-15 16:26:53,150 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:26:53,151 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:26:53,160 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:26:53,163 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

data\rag-data\pdfs\amazon\amazon 10-q q2 2025.pdf


2025-12-15 16:26:54,388 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:26:54,668 - INFO - Processing document amazon 10-q q2 2025.pdf
2025-12-15 16:27:11,010 - INFO - Finished converting document amazon 10-q q2 2025.pdf in 17.86 sec.
2025-12-15 16:27:11,125 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:27:11,129 - INFO - Going to convert document batch...
2025-12-15 16:27:11,130 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:27:11,130 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:27:11,139 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:27:11,142 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:27:11,143 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP

data\rag-data\pdfs\amazon\amazon 10-q q3 2024.pdf


2025-12-15 16:27:12,319 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:27:12,602 - INFO - Processing document amazon 10-q q3 2024.pdf
2025-12-15 16:27:39,554 - INFO - Finished converting document amazon 10-q q3 2024.pdf in 28.43 sec.
2025-12-15 16:27:39,803 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:27:39,806 - INFO - Going to convert document batch...
2025-12-15 16:27:39,806 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:27:39,807 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:27:39,816 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:27:39,819 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:27:39,819 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP

data\rag-data\pdfs\apple\apple 10-k 2023.pdf


2025-12-15 16:27:40,065 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-15 16:27:40,067 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:27:40,982 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:27:41,270 - INFO - Processing document apple 10-k 2023.pdf
2025-12-15 16:28:07,149 - INFO - Finished converting document apple 10-k 2023.pdf in 27.35 sec.
2025-12-15 16:28:18,567 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:28:18,571 - INFO - Going to convert document batch...
2025-12-15 16:28:18,571 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:28:18,572 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:28:18,580 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:28:18,583 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m

data\rag-data\pdfs\apple\apple 10-k 2024.pdf


2025-12-15 16:28:19,718 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:28:19,967 - INFO - Processing document apple 10-k 2024.pdf
2025-12-15 16:28:49,397 - INFO - Finished converting document apple 10-k 2024.pdf in 30.83 sec.
2025-12-15 16:29:07,377 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:29:07,379 - INFO - Going to convert document batch...
2025-12-15 16:29:07,380 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:29:07,381 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:29:07,389 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:29:07,393 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:29:07,393 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_d

data\rag-data\pdfs\apple\apple 10-q q1 2024.pdf


2025-12-15 16:29:07,616 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-15 16:29:07,616 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:29:08,586 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:29:08,867 - INFO - Processing document apple 10-q q1 2024.pdf
2025-12-15 16:29:19,599 - INFO - Finished converting document apple 10-q q1 2024.pdf in 12.22 sec.
2025-12-15 16:29:19,667 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:29:19,679 - INFO - Going to convert document batch...
2025-12-15 16:29:19,680 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:29:19,680 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:29:19,690 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:29:19,693 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.on

data\rag-data\pdfs\apple\apple 10-q q2 2024.pdf


2025-12-15 16:29:20,951 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:29:21,215 - INFO - Processing document apple 10-q q2 2024.pdf
2025-12-15 16:29:31,809 - INFO - Finished converting document apple 10-q q2 2024.pdf in 12.14 sec.
2025-12-15 16:29:31,885 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:29:31,887 - INFO - Going to convert document batch...
2025-12-15 16:29:31,887 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:29:31,888 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:29:31,897 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:29:31,900 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:29:31,901 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-O

data\rag-data\pdfs\apple\apple 10-q q4 2023.pdf


2025-12-15 16:29:33,127 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:29:33,412 - INFO - Processing document apple 10-q q4 2023.pdf
2025-12-15 16:29:42,815 - INFO - Finished converting document apple 10-q q4 2023.pdf in 10.93 sec.
2025-12-15 16:29:42,882 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:29:42,883 - INFO - Going to convert document batch...
2025-12-15 16:29:42,884 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:29:42,885 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:29:42,894 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:29:42,897 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:29:42,898 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-O

data\rag-data\pdfs\apple\apple 8-k q4 2023.pdf


2025-12-15 16:29:43,116 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:29:44,014 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:29:44,305 - INFO - Processing document apple 8-k q4 2023.pdf
2025-12-15 16:29:47,100 - INFO - Finished converting document apple 8-k q4 2023.pdf in 4.22 sec.
2025-12-15 16:29:47,132 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:29:47,136 - INFO - Going to convert document batch...
2025-12-15 16:29:47,136 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:29:47,137 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:29:47,146 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:29:47,149 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:29:47,150 [RapidOCR] main.py:53: Using C:\Users\laxmi\

data\rag-data\pdfs\google\google 10-k 2023.pdf


2025-12-15 16:29:47,374 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:29:48,358 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:29:48,596 - INFO - Processing document google 10-k 2023.pdf
2025-12-15 16:30:23,932 - INFO - Finished converting document google 10-k 2023.pdf in 36.80 sec.
2025-12-15 16:31:02,692 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:31:02,699 - INFO - Going to convert document batch...
2025-12-15 16:31:02,701 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:31:02,703 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:31:02,713 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:31:02,716 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:31:02,717 [RapidOCR] main.py:53: Using C:\Users\laxmi\a

data\rag-data\pdfs\google\google 10-k 2024.pdf


2025-12-15 16:31:02,954 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-15 16:31:02,954 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:31:03,892 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:31:04,146 - INFO - Processing document google 10-k 2024.pdf
2025-12-15 16:31:41,926 - INFO - Finished converting document google 10-k 2024.pdf in 39.23 sec.
2025-12-15 16:32:22,257 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:32:22,259 - INFO - Going to convert document batch...
2025-12-15 16:32:22,260 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:32:22,261 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:32:22,272 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:32:22,277 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[

data\rag-data\pdfs\google\google 10-q q1 2025.pdf


2025-12-15 16:32:22,495 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-15 16:32:22,496 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:32:23,469 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:32:23,710 - INFO - Processing document google 10-q q1 2025.pdf
2025-12-15 16:32:44,905 - INFO - Finished converting document google 10-q q1 2025.pdf in 22.65 sec.
2025-12-15 16:32:45,044 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:32:45,047 - INFO - Going to convert document batch...
2025-12-15 16:32:45,048 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:32:45,048 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:32:45,061 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:32:45,065 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

data\rag-data\pdfs\google\google 10-q q2 2024.pdf


2025-12-15 16:32:46,271 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:32:46,552 - INFO - Processing document google 10-q q2 2024.pdf
2025-12-15 16:33:11,604 - INFO - Finished converting document google 10-q q2 2024.pdf in 26.56 sec.
2025-12-15 16:33:11,746 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:33:11,748 - INFO - Going to convert document batch...
2025-12-15 16:33:11,749 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:33:11,750 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:33:11,762 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:33:11,765 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:33:11,766 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP

data\rag-data\pdfs\google\google 10-q q2 2025.pdf


2025-12-15 16:33:12,945 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:33:13,248 - INFO - Processing document google 10-q q2 2025.pdf
2025-12-15 16:33:40,215 - INFO - Finished converting document google 10-q q2 2025.pdf in 28.47 sec.
2025-12-15 16:33:40,365 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:33:40,367 - INFO - Going to convert document batch...
2025-12-15 16:33:40,368 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:33:40,368 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:33:40,380 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:33:40,383 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:33:40,384 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP

data\rag-data\pdfs\google\google 10-q q3 2024.pdf


2025-12-15 16:33:41,579 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:33:41,860 - INFO - Processing document google 10-q q3 2024.pdf
2025-12-15 16:34:05,151 - INFO - Finished converting document google 10-q q3 2024.pdf in 24.79 sec.
2025-12-15 16:34:05,297 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:34:05,304 - INFO - Going to convert document batch...
2025-12-15 16:34:05,304 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:34:05,305 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:34:05,317 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:34:05,321 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:34:05,322 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP

data\rag-data\pdfs\meta\meta 10-k 2024.pdf


2025-12-15 16:34:06,512 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:34:06,802 - INFO - Processing document meta 10-k 2024.pdf
2025-12-15 16:35:04,336 - INFO - Finished converting document meta 10-k 2024.pdf in 59.04 sec.
2025-12-15 16:35:57,982 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:35:57,986 - INFO - Going to convert document batch...
2025-12-15 16:35:57,988 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:35:57,990 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:35:58,000 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:35:58,004 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:35:58,004 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det

data\rag-data\pdfs\meta\meta 10-q q1 2024.pdf


2025-12-15 16:35:59,192 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:35:59,527 - INFO - Processing document meta 10-q q1 2024.pdf
2025-12-15 16:36:07,265 - INFO - Finished converting document meta 10-q q1 2024.pdf in 9.28 sec.
2025-12-15 16:36:37,732 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:36:37,736 - INFO - Going to convert document batch...
2025-12-15 16:36:37,738 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:36:37,739 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:36:37,750 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:36:37,754 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:36:37,755 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv

data\rag-data\pdfs\meta\meta 10-q q1 2025.pdf


2025-12-15 16:36:38,905 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:36:39,150 - INFO - Processing document meta 10-q q1 2025.pdf
2025-12-15 16:36:46,897 - INFO - Finished converting document meta 10-q q1 2025.pdf in 9.17 sec.
2025-12-15 16:37:15,030 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:37:15,043 - INFO - Going to convert document batch...
2025-12-15 16:37:15,044 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:37:15,045 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:37:15,056 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:37:15,059 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:37:15,060 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv

data\rag-data\pdfs\meta\meta 10-q q2 2024.pdf


2025-12-15 16:37:16,242 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:37:16,533 - INFO - Processing document meta 10-q q2 2024.pdf
2025-12-15 16:37:24,312 - INFO - Finished converting document meta 10-q q2 2024.pdf in 9.28 sec.
2025-12-15 16:38:12,320 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:38:12,324 - INFO - Going to convert document batch...
2025-12-15 16:38:12,326 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:38:12,327 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:38:12,338 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:38:12,342 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:38:12,343 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv

data\rag-data\pdfs\meta\meta 10-q q2 2025.pdf


2025-12-15 16:38:12,542 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:38:13,558 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:38:13,862 - INFO - Processing document meta 10-q q2 2025.pdf
2025-12-15 16:38:21,907 - INFO - Finished converting document meta 10-q q2 2025.pdf in 9.59 sec.
2025-12-15 16:38:51,686 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:38:51,690 - INFO - Going to convert document batch...
2025-12-15 16:38:51,690 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:38:51,691 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:38:51,700 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:38:51,703 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-15 16:38:51,704 [RapidOCR] main.py:53: Using C:\Users\laxmi\

data\rag-data\pdfs\meta\meta 10-q q3 2024.pdf


2025-12-15 16:38:51,940 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-15 16:38:51,940 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:38:52,924 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:38:53,186 - INFO - Processing document meta 10-q q3 2024.pdf
2025-12-15 16:39:00,939 - INFO - Finished converting document meta 10-q q3 2024.pdf in 9.25 sec.
2025-12-15 16:39:49,671 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:39:49,685 - INFO - Going to convert document batch...
2025-12-15 16:39:49,686 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:39:49,686 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:39:49,699 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:39:49,702 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx

data\rag-data\pdfs\meta\meta 10-q q3 2025.pdf


2025-12-15 16:39:49,933 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-15 16:39:49,934 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:39:50,872 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:39:51,118 - INFO - Processing document meta 10-q q3 2025.pdf
2025-12-15 16:39:59,302 - INFO - Finished converting document meta 10-q q3 2025.pdf in 9.63 sec.
2025-12-15 16:40:43,536 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:40:43,546 - INFO - Going to convert document batch...
2025-12-15 16:40:43,547 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:40:43,547 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:40:43,557 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:40:43,560 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx

data\rag-data\pdfs\meta\meta 10-q q4 2024.pdf


2025-12-15 16:40:43,745 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-15 16:40:43,745 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:40:44,747 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:40:45,025 - INFO - Processing document meta 10-q q4 2024.pdf
2025-12-15 16:40:52,733 - INFO - Finished converting document meta 10-q q4 2024.pdf in 9.20 sec.
2025-12-15 16:41:19,890 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-15 16:41:19,899 - INFO - Going to convert document batch...
2025-12-15 16:41:19,900 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f076e025fec69d17629b0f4921f3a2a3
2025-12-15 16:41:19,901 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-15 16:41:19,911 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-15 16:41:19,914 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx

data\rag-data\pdfs\meta\meta10-k 2023.pdf


2025-12-15 16:41:20,129 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-15 16:41:20,129 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:41:21,162 - INFO - Accelerator device: 'cuda:0'
2025-12-15 16:41:21,414 - INFO - Processing document meta10-k 2023.pdf
2025-12-15 16:42:28,167 - INFO - Finished converting document meta10-k 2023.pdf in 68.28 sec.
