In [1]:
import os
import sys

from marker.converters.pdf import PdfConverter
from marker.output import text_from_rendered
from marker.models import create_model_dict
from marker.config.parser import ConfigParser
from tqdm.autonotebook import tqdm as notebook_tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Notebook settings: edit these before running the cells below ---
INPUT_PATH: str = "data/1_english.jpg"  # placeholder; point this at your own file
OUTPUT_FORMAT: str = "markdown"  # desired output format: "markdown", "json", or "html"
USE_LLM: bool = False  # True boosts accuracy via an LLM (requires API key configuration)
# ---------------------------------------------------------------------

In [3]:
def extract_with_marker(input_path: str, output_format: str = "markdown", use_llm: bool = False) -> str:
    """Convert a document with marker and return the rendered text.

    Args:
        input_path: Path of the file to convert. The conversion goes through
            ``PdfConverter``; this notebook also feeds it a JPEG image.
        output_format: Value stored under the ``"output_format"`` config key
            (the notebook's settings cell lists "markdown", "json", "html").
        use_llm: When True, ``"use_llm"`` is set in the config and the
            parser's LLM service is handed to the converter; otherwise no
            LLM service is supplied.

    Returns:
        The first element of the tuple returned by ``text_from_rendered``.

    Raises:
        FileNotFoundError: If ``input_path`` does not name an existing file.
    """
    # Fail fast: create_model_dict() below loads every model, so a bad path
    # should be reported before that work starts rather than after it.
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path!r}")

    artifact_dict = create_model_dict()

    config = {"output_format": output_format}
    if use_llm:
        config["use_llm"] = True
    config_parser = ConfigParser(config)

    converter = PdfConverter(
        config=config_parser.generate_config_dict(),
        artifact_dict=artifact_dict,
        processor_list=config_parser.get_processors(),
        renderer=config_parser.get_renderer(),
        # Only ask the parser for an LLM service when the caller opted in.
        llm_service=config_parser.get_llm_service() if use_llm else None,
    )

    rendered = converter(input_path)
    # text_from_rendered returns a 3-tuple; only the text part is needed here.
    text, _, _ = text_from_rendered(rendered)
    return text

In [4]:
if __name__ == "__main__":
    # Convert the file named by INPUT_PATH using the notebook-level settings
    # (OUTPUT_FORMAT, USE_LLM), then print the extracted text.
    file_path = INPUT_PATH
    result = extract_with_marker(file_path, output_format=OUTPUT_FORMAT, use_llm=USE_LLM)
    print(result)

Loaded layout model s3://layout/2025_02_18 on device cpu with dtype torch.float32
Loaded texify model s3://texify/2025_02_18 on device cpu with dtype torch.float32
Loaded recognition model s3://text_recognition/2025_02_18 on device cpu with dtype torch.float32
Loaded table recognition model s3://table_recognition/2025_02_18 on device cpu with dtype torch.float32
Loaded detection model s3://text_detection/2025_02_28 on device cpu with dtype torch.float32
Loaded detection model s3://inline_math_detection/2025_02_24 on device cpu with dtype torch.float32


Recognizing layout: 100%|██████████| 1/1 [00:04<00:00,  4.98s/it]
Running OCR Error Detection: 100%|██████████| 1/1 [00:00<00:00,  9.51it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.74s/it]
Recognizing Text: 100%|██████████| 1/1 [00:16<00:00, 16.12s/it]
Detecting bboxes: 0it [00:00, ?it/s]


## Lorem Ipsum

## Meaning

Lorem Ipsum is essentially the typeset and printing industry's dummy text. Since an unidentified printer jumbled a galley of type to create a type specimen book in the 1500s, Lorem Ipsum has been the industry standard sham text.

It is virtually intact, having withstood not just five centuries but also the transition to electronic typesetting.

The introduction of Letraset sheets with sections from Lorem Ipsum in the 1960s and, more recently, the inclusion of Lorem Ipsum versions in desktop publishing programs like Aldus PageMaker contributed to its popularization.
