In [1]:
# System dependency for the PDF rasterization step: pdf2image (imported in a later cell)
# relies on the Poppler command-line tools, which the poppler-utils package provides.
# -qq keeps apt output to a minimum; -y answers the install prompt non-interactively.
!apt-get update -qq
!apt-get install -y -qq poppler-utils

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package poppler-utils.
(Reading database ... 126675 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.11_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.11) ...
Setting up poppler-utils (22.02.0-2ubuntu0.11) ...
Processing triggers for man-db (2.10.2-1) ...


In [2]:
!pip install -q "mineru-vl-utils[transformers]" transformers pillow pdf2image torch --upgrade

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
import json
import io
from pathlib import Path
from PIL import Image
from pdf2image import convert_from_path
import torch

# MinerU client
from mineru_vl_utils import MinerUClient

# Report whether torch can see a CUDA device in this runtime.
USE_CUDA = torch.cuda.is_available()
print("CUDA available:", USE_CUDA)

# Hugging Face Hub id of the checkpoint that the model-loading cell downloads.
MODEL_ID = "opendatalab/MinerU2.5-2509-1.2B"

CUDA available: True


In [4]:
# Load the model and processor (transformers backend) and wrap them in a MinerU client.
# mineru-vl-utils can take either a transformers model object or a vLLM client; this
# notebook loads a transformers model.
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# The import stays outside the try block on purpose: a missing or renamed model class
# must surface as an ImportError, not be misreported as a device-placement failure.
try:
    model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, device_map="auto")
except Exception as e:
    # Best-effort fallback: retry with every module pinned to the CPU.
    # If memory is constrained, consider low_cpu_mem_usage or a quantized checkpoint.
    print("Auto device_map load failed or not supported, falling back to CPU. Error:", e)
    model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, device_map={"": "cpu"})

processor = AutoProcessor.from_pretrained(MODEL_ID, use_fast=True)

# Create the MinerU client using the transformers backend.
client = MinerUClient(backend="transformers", model=model, processor=processor)
print("Model and client ready.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.31G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/346 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/800 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

Model and client ready.


In [5]:
# Helper: convert PDF pages to images
def pdf_to_images(pdf_path, dpi=200, output_folder="pdf_images"):
    """
    Rasterize a PDF and write one PNG per page into output_folder.

    The folder is created if it does not exist. Files are named
    page_001.png, page_002.png, ... following the 1-based page order.

    Returns a tuple (pages, paths): the in-memory page images returned by
    convert_from_path, and the matching list of saved file paths as strings.
    """
    os.makedirs(output_folder, exist_ok=True)
    folder = Path(output_folder)

    rendered_pages = convert_from_path(pdf_path, dpi=dpi)

    # Work out every destination first, then write the pages in the same order.
    targets = [folder / f"page_{number:03d}.png" for number in range(1, len(rendered_pages) + 1)]
    for rendered, target in zip(rendered_pages, targets):
        rendered.save(target, "PNG")

    return rendered_pages, [str(target) for target in targets]


In [6]:
# Main function: process a PDF and write the extraction results to JSON
def pdf_to_mineru_json(pdf_path, output_json_path="output.json", dpi=200, verbose=True):
    """
    Run MinerU extraction on every page of a PDF and save the results as JSON.

    Args:
        pdf_path: Path of the PDF to process.
        output_json_path: Destination of the JSON document (written as UTF-8).
        dpi: Rendering resolution forwarded to pdf_to_images.
        verbose: When true, print per-page progress and the final "saved" message.

    Returns:
        output_json_path, unchanged.

    The JSON has the shape {"pdf": <file name>, "pages": [...]}; each page entry holds a
    1-based "page_index", the "image_path" of the rendered page image, and the
    "extracted_blocks" returned by the client for that page.
    """
    images, image_paths = pdf_to_images(pdf_path, dpi=dpi)
    total_pages = len(images)

    all_pages = []
    # Images and paths come back in the same page order, so walk them together
    # instead of indexing image_paths by hand.
    for page_index, (img, image_path) in enumerate(zip(images, image_paths), start=1):
        if verbose:
            print(f"Processing page {page_index}/{total_pages} ...")

        # The client is handed the in-memory PIL image; the call is synchronous.
        # NOTE(review): the result is presumably a JSON-serializable list of layout blocks
        # (type, bbox, content, ...) — confirm against the mineru-vl-utils documentation.
        extracted_blocks = client.two_step_extract(img)

        # Keep the extraction output as-is and attach page metadata.
        all_pages.append({
            "page_index": page_index,
            "image_path": image_path,
            "extracted_blocks": extracted_blocks
        })

    # Save JSON
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump({"pdf": os.path.basename(pdf_path), "pages": all_pages}, f, ensure_ascii=False, indent=2)

    # Honor the verbose flag here too, so verbose=False really is silent.
    if verbose:
        print("Saved JSON to", output_json_path)
    return output_json_path


In [7]:
# Upload a PDF and run the conversion (interactive part).
# Uses Colab's file upload widget; mounting Drive is an alternative for larger files.
from google.colab import files

uploaded = files.upload()  # choose your PDF file from local machine

# The upload result may be empty (e.g. the dialog was cancelled); fail with a clear
# message instead of an IndexError on the lookup below.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a PDF.")

# take the first uploaded file
pdf_file = next(iter(uploaded))
print("Uploaded:", pdf_file)

# run conversion
out_json = pdf_to_mineru_json(pdf_file, output_json_path="mineru_output.json", dpi=200)
# trigger a browser download of the result in Colab
files.download(out_json)


Saving TBS_Handbook-2022.pdf to TBS_Handbook-2022.pdf
Uploaded: TBS_Handbook-2022.pdf
Processing page 1/30 ...


Predict: 100%|██████████| 1/1 [00:18<00:00, 18.38s/it]
Predict: 100%|██████████| 10/10 [00:04<00:00,  2.34it/s]


Processing page 2/30 ...


Predict: 100%|██████████| 1/1 [00:08<00:00,  8.52s/it]
Predict: 100%|██████████| 4/4 [00:05<00:00,  1.27s/it]


Processing page 3/30 ...


Predict: 100%|██████████| 1/1 [01:05<00:00, 65.15s/it]
Predict: 100%|██████████| 48/48 [00:27<00:00,  1.75it/s]


Processing page 4/30 ...


Predict: 100%|██████████| 1/1 [00:21<00:00, 21.64s/it]
Predict: 100%|██████████| 16/16 [00:04<00:00,  3.84it/s]


Processing page 5/30 ...


Predict: 100%|██████████| 1/1 [00:28<00:00, 28.75s/it]
Predict: 100%|██████████| 21/21 [00:25<00:00,  1.20s/it]


Processing page 6/30 ...


Predict: 100%|██████████| 1/1 [00:31<00:00, 31.46s/it]
Predict: 100%|██████████| 23/23 [00:08<00:00,  2.64it/s]


Processing page 7/30 ...


Predict: 100%|██████████| 1/1 [00:33<00:00, 33.72s/it]
Predict: 100%|██████████| 26/26 [00:24<00:00,  1.04it/s]


Processing page 8/30 ...


Predict: 100%|██████████| 1/1 [00:26<00:00, 26.44s/it]
Predict: 100%|██████████| 19/19 [00:32<00:00,  1.73s/it]


Processing page 9/30 ...


Predict: 100%|██████████| 1/1 [00:19<00:00, 19.88s/it]
Predict: 100%|██████████| 15/15 [00:29<00:00,  1.96s/it]


Processing page 10/30 ...


Predict: 100%|██████████| 1/1 [00:17<00:00, 17.15s/it]
Predict: 100%|██████████| 11/11 [00:37<00:00,  3.39s/it]


Processing page 11/30 ...


Predict: 100%|██████████| 1/1 [00:20<00:00, 20.01s/it]
Predict: 100%|██████████| 13/13 [00:30<00:00,  2.32s/it]


Processing page 12/30 ...


Predict: 100%|██████████| 1/1 [00:13<00:00, 13.06s/it]
Predict: 100%|██████████| 8/8 [00:31<00:00,  3.92s/it]


Processing page 13/30 ...


Predict: 100%|██████████| 1/1 [00:20<00:00, 20.18s/it]
Predict: 100%|██████████| 14/14 [00:31<00:00,  2.22s/it]


Processing page 14/30 ...


Predict: 100%|██████████| 1/1 [00:16<00:00, 16.97s/it]
Predict: 100%|██████████| 12/12 [00:30<00:00,  2.58s/it]


Processing page 15/30 ...


Predict: 100%|██████████| 1/1 [00:24<00:00, 24.83s/it]
Predict: 100%|██████████| 18/18 [00:28<00:00,  1.59s/it]


Processing page 16/30 ...


Predict: 100%|██████████| 1/1 [00:17<00:00, 17.37s/it]
Predict: 100%|██████████| 11/11 [00:34<00:00,  3.13s/it]


Processing page 17/30 ...


Predict: 100%|██████████| 1/1 [00:13<00:00, 13.24s/it]
Predict: 100%|██████████| 8/8 [00:25<00:00,  3.18s/it]


Processing page 18/30 ...


Predict: 100%|██████████| 1/1 [00:27<00:00, 27.77s/it]
Predict: 100%|██████████| 20/20 [00:26<00:00,  1.32s/it]


Processing page 19/30 ...


Predict: 100%|██████████| 1/1 [00:16<00:00, 16.12s/it]
Predict: 100%|██████████| 10/10 [00:34<00:00,  3.41s/it]


Processing page 20/30 ...


Predict: 100%|██████████| 1/1 [00:13<00:00, 13.57s/it]
Predict: 100%|██████████| 9/9 [00:18<00:00,  2.04s/it]


Processing page 21/30 ...


Predict: 100%|██████████| 1/1 [00:20<00:00, 20.63s/it]
Predict: 100%|██████████| 14/14 [00:34<00:00,  2.43s/it]


Processing page 22/30 ...


Predict: 100%|██████████| 1/1 [00:25<00:00, 25.79s/it]
Predict: 100%|██████████| 19/19 [00:28<00:00,  1.51s/it]


Processing page 23/30 ...


Predict: 100%|██████████| 1/1 [00:37<00:00, 37.53s/it]
Predict: 100%|██████████| 28/28 [00:42<00:00,  1.51s/it]


Processing page 24/30 ...


Predict: 100%|██████████| 1/1 [00:40<00:00, 40.39s/it]
Predict: 100%|██████████| 31/31 [00:44<00:00,  1.43s/it]


Processing page 25/30 ...


Predict: 100%|██████████| 1/1 [00:23<00:00, 23.09s/it]
Predict: 100%|██████████| 18/18 [00:34<00:00,  1.91s/it]


Processing page 26/30 ...


Predict: 100%|██████████| 1/1 [00:24<00:00, 24.08s/it]
Predict: 100%|██████████| 19/19 [00:29<00:00,  1.57s/it]


Processing page 27/30 ...


Predict: 100%|██████████| 1/1 [00:21<00:00, 21.41s/it]
Predict: 100%|██████████| 15/15 [00:24<00:00,  1.61s/it]


Processing page 28/30 ...


Predict: 100%|██████████| 1/1 [00:07<00:00,  7.74s/it]
Predict: 100%|██████████| 3/3 [00:02<00:00,  1.40it/s]


Processing page 29/30 ...


Predict: 100%|██████████| 1/1 [00:23<00:00, 23.95s/it]
Predict: 100%|██████████| 18/18 [00:17<00:00,  1.02it/s]


Processing page 30/30 ...


Predict: 100%|██████████| 1/1 [00:07<00:00,  7.70s/it]
Predict: 100%|██████████| 3/3 [00:01<00:00,  1.64it/s]

Saved JSON to mineru_output.json





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>