# SmolDocling

In [6]:
# Install dependencies
!pip install transformers torch docling_core markdown2

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting markdown2
  Downloading markdown2-2.5.3-py3-none-any.whl.metadata (2.1 kB)
Downloading markdown2-2.5.3-py3-none-any.whl (48 kB)
Installing collected packages: markdown2
Successfully installed markdown2-2.5.3


# Interpret image (generation is capped at 256 new tokens because describing the image takes a lot of tokens)

In [None]:
import torch
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

# Run on the GPU when one is available; otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint shared by the processor and the model.
MODEL_ID = "ds4sd/SmolDocling-256M-preview"

# Load the image to interpret
image = load_image("sample.png")

# Initialize processor and model
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    # float16 on GPU, float32 on CPU (an earlier revision of this cell used
    # torch.bfloat16 unconditionally).
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    # Eager attention, for GPUs that do not support flash attention.
    _attn_implementation="eager",
).to(DEVICE)

# Create input messages: one user turn holding the image placeholder followed
# by the text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "You are a document generation model. Please generate a document from the image."},
        ],
    },
]

# Prepare inputs
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt")
inputs = inputs.to(DEVICE)

# Generate outputs.
# 256 new tokens keeps the run short, but the DocTags may be cut off before the
# model finishes; an earlier revision of this cell used max_new_tokens=8192.
max_new_tokens = 256
generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

# Drop the prompt tokens so that only the newly generated tokens are decoded.
prompt_length = inputs.input_ids.shape[1]
trimmed_generated_ids = generated_ids[:, prompt_length:]
# Special tokens are kept (skip_special_tokens=False) in the decoded text that
# is handed to the DocTags parser below.
doctags = processor.batch_decode(
    trimmed_generated_ids,
    skip_special_tokens=False,
)[0].lstrip()

# Populate document
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
print(doctags)

# Create a docling document.
# NOTE(review): depending on the installed docling_core release,
# load_from_doctags either fills the instance in place or acts as a static
# factory that returns a new, populated document. Keeping the returned object
# (when there is one) gives a populated `doc` in both cases - confirm against
# the installed version.
doc = DoclingDocument(name="Document")
loaded_doc = doc.load_from_doctags(doctags_doc)
if loaded_doc is not None:
    doc = loaded_doc

print(doc.export_to_markdown())

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


<chart><loc_0><loc_0><loc_500><loc_500>ID Name Colour Description 1 Tag 1 2 Tag 2 3 Tag 3 3 Tag 3 4 Tag 4 5 Tag 5 6 Tag 6 7 Tag 7 8 Tag 8 9 Tag 9 10 Tag 10 11 Tag 11 12 Tag 12 13 Tag 13 14 Tag 14 15 Tag 15 16 Tag 16 17 Tag 17 18 Tag 18 19 Tag 19 20 Tag 20 21 Tag 21 22 Tag 22 23 Tag 23 24 Tag 24 25 Tag 25 26 Tag 26 27 Tag 27 28 Tag 28 29 Tag 29 30 Tag 30 31 Tag 31 32 Tag 32 33 Tag 33 34 Tag 34 35 Tag 35 3



# Extract tables from a PDF by converting each page to an image and running the model on it (OCR)

In [19]:
from pdf2image import convert_from_path
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image
from PIL import Image
import torch

# Run on the GPU when one is available; otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Tunable settings for this cell.
MODEL_ID = "ds4sd/SmolDocling-256M-preview"
PDF_PATH = "./research_docs/TABLAS P-3 Comparativa de Estandares.pdf"
PDF_DPI = 300  # resolution used when rendering PDF pages to images
# Cap on generated tokens per page; output longer than this is cut off.
TABLE_MAX_NEW_TOKENS = 512

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    # float16 on GPU, float32 on CPU.
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    _attn_implementation="eager",
).to(DEVICE)

# Render every page of the PDF to an image (one list entry per page).
images = convert_from_path(PDF_PATH, dpi=PDF_DPI)

# The instruction is identical for every page, so the chat prompt is built once
# here instead of once per loop iteration.
# NOTE(review): the captured output of this cell contains <otsl> DocTags rather
# than HTML <table> markup; check the model card for the instructions the
# checkpoint actually supports.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Extract all tables from the document in HTML format <table>, only return <table> tags."},
        ],
    },
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

# Collected (decoded model output, page image) pairs, one per page.
doctags = []
for page_number, page_image in enumerate(images, start=1):
    print(f"Processing page {page_number}...")

    # Keep a copy of the rendered page on disk (written to the working directory).
    page_image.save(f"page_{page_number}.png")

    inputs = processor(text=prompt, images=[page_image], return_tensors="pt").to(DEVICE)

    generated_ids = model.generate(**inputs, max_new_tokens=TABLE_MAX_NEW_TOKENS)
    # Drop the prompt tokens so that only the newly generated tokens are decoded.
    prompt_len = inputs.input_ids.shape[1]
    trimmed_ids = generated_ids[:, prompt_len:]

    doctag = processor.batch_decode(trimmed_ids, skip_special_tokens=False)[0].lstrip()
    doctags.append((doctag, page_image))
    print(f"\n🧾 Page {page_number} (HTML extracted):\n", doctag)

Processing page 1...


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



🧾 Page 1 (HTML extracted):
 <otsl><loc_83><loc_47><loc_454><loc_360>TABLA COMPARATIVA DE ESTANDARES  USO DEL EDIFICIO  FUENTE DE  INFORMACION  POBLACION  x Dormitorio  %  PoblACION  servida cada 5  %  LIMITE DE  TIEMPO DE  ESPERA  VIVIENDA  Ref. 1  Ref. 2  Ref. 1  Ref. 2  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1  Ref. 1 


In [21]:
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from IPython.display import display, HTML

# Convert each (decoded model output, page image) pair in `doctags` into a
# docling document, export it to HTML, and write the concatenated result to
# output.html.
html_parts = []
# The loop variable is named `page_img` so it does not overwrite the `image`
# global defined by the single-image cell.
for doctag, page_img in doctags:
    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctag], [page_img])

    # NOTE(review): depending on the installed docling_core release,
    # load_from_doctags either fills the instance in place or acts as a static
    # factory that returns a new, populated document. Keeping the returned
    # object (when there is one) gives a populated `doc` in both cases -
    # confirm against the installed version.
    doc = DoclingDocument(name="Document")
    loaded_doc = doc.load_from_doctags(doctags_doc)
    if loaded_doc is not None:
        doc = loaded_doc

    # NOTE(review): if export_to_html() returns a complete HTML document per
    # call, the concatenation below holds several documents back to back -
    # verify the result renders as intended.
    html_parts.append(doc.export_to_html())

html_output = "".join(html_parts)

# Write the HTML output to a file. The encoding is set explicitly so non-ASCII
# text is written the same way regardless of the platform's default encoding.
with open("output.html", "w", encoding="utf-8") as f:
    f.write(html_output)

# Uncomment to preview the result inline in the notebook:
# display(HTML(html_output))