In [1]:
# =========================
# 1) GPU + deps
# =========================
# Print the attached GPU / driver so the saved output records what hardware this run used.
!nvidia-smi
!pip -q install --upgrade pip
# Pinned PyTorch stack, pulled from the CUDA 11.8 wheel index (see --index-url).
!pip -q install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118
# transformers/tokenizers are pinned; einops, addict and easydict are left unpinned
# (presumably required by the model's remote code — TODO confirm and pin).
!pip -q install transformers==4.46.3 tokenizers==0.20.3 einops addict easydict
# Best effort: when the flash-attn install fails, only the message below is printed and setup continues.
!pip -q install flash-attn==2.7.3 --no-build-isolation || echo "flash-attn not available; falling back to eager attention"
# System package for the PDF -> image step. All apt output, including errors, is discarded,
# so a failed install is silent here.
!apt-get -y update >/dev/null 2>&1 && apt-get -y install poppler-utils >/dev/null 2>&1
# Unpinned helper packages; pdf2image is the one imported later for PDF rasterisation.
!pip -q install pdf2image pillow bs4 pandas tabulate

# (Optional) HF login if you hit rate limits:
# from huggingface_hub import login; login()

Sat Oct 25 03:19:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   44C    P8             12W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# =========================
# 2) Upload a PDF
# =========================
from google.colab import files
# Open the Colab upload widget; the returned mapping is keyed by uploaded file name.
up = files.upload()
assert len(up) == 1, "Upload exactly one PDF"

# With exactly one entry, the first key is the PDF's file name.
pdf_path = next(iter(up))
print("PDF:", pdf_path)


Saving NIPS-2017-attention-is-all-you-need-Paper.pdf to NIPS-2017-attention-is-all-you-need-Paper.pdf
PDF: NIPS-2017-attention-is-all-you-need-Paper.pdf


In [3]:
# =========================
# 3) PDF -> images  (into ./dpsk_ocr_outputs/pages)
# =========================
from pdf2image import convert_from_path
from pathlib import Path
import os

# Output layout: page rasters and per-page Markdown live under one root folder.
out_dir = Path("dpsk_ocr_outputs")
img_dir = out_dir / "pages"
md_dir = out_dir / "markdown_pages"
img_dir.mkdir(parents=True, exist_ok=True)
md_dir.mkdir(parents=True, exist_ok=True)

IM_DPI = 180                   # try 180; 160–200 is a good range

# paths_only=True asks pdf2image for the written file paths rather than loaded images;
# sorting the paths fixes the page order used by the later cells.
images = sorted(
    convert_from_path(
        pdf_path,
        dpi=IM_DPI,
        fmt="png",
        output_folder=str(img_dir),
        output_file="page",
        paths_only=True,
    )
)
print(f"Extracted {len(images)} page image(s) → {img_dir}")

Extracted 11 page image(s) → dpsk_ocr_outputs/pages


In [4]:
# =========================
# 4) Load DeepSeek-OCR (Transformers)
# =========================
import torch, os
from transformers import AutoModel, AutoTokenizer, GenerationConfig

# Restrict the process to GPU 0 and allow TF32 matmuls.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
torch.backends.cuda.matmul.allow_tf32 = True
try:
    torch.set_float32_matmul_precision("high")
except Exception:
    # Best effort: any failure of this call is deliberately ignored.
    pass

model_id = "deepseek-ai/DeepSeek-OCR"

# Prefer FlashAttention-2 when the package imports cleanly; otherwise use eager attention.
try:
    import flash_attn  # noqa
except Exception:
    attn_impl = "eager"
else:
    attn_impl = "flash_attention_2"

tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Reuse EOS as the padding token when the tokenizer ships without one.
if tok.pad_token is None and tok.eos_token is not None:
    tok.pad_token = tok.eos_token

# trust_remote_code=True executes modelling code downloaded from the Hub repository.
model = (
    AutoModel.from_pretrained(
        model_id,
        trust_remote_code=True,
        _attn_implementation=attn_impl,
        torch_dtype=torch.bfloat16,
        use_safetensors=True,
    )
    .to("cuda")
    .eval()
)

# Generation caps (avoid long loops)
# NOTE(review): these only take effect if the remote-code model.infer() honours
# model.generation_config — confirm against the model's implementation.
if getattr(model, "generation_config", None) is None:
    model.generation_config = GenerationConfig()
gc = model.generation_config
for _name, _value in {
    "pad_token_id": tok.pad_token_id,
    "eos_token_id": tok.eos_token_id,
    "do_sample": False,
    "num_beams": 1,
    "top_p": 1.0,
    "temperature": None,
    "max_new_tokens": 512,   # reduce if a page still runs long; increase if you see truncation
    "no_repeat_ngram_size": 6,
    "repetition_penalty": 1.15,
    "length_penalty": 0.9,
}.items():
    setattr(gc, _name, _value)
model.generation_config = gc

print("Loaded:", model_id, "| attention:", attn_impl, "| dtype:", next(model.parameters()).dtype)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

modeling_deepseekocr.py: 0.00B [00:00, ?B/s]

conversation.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR:
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


deepencoder.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR:
- deepencoder.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_deepseekv2.py: 0.00B [00:00, ?B/s]

configuration_deepseek_v2.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR:
- configuration_deepseek_v2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR:
- modeling_deepseekv2.py
- configuration_deepseek_v2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR:
- modeling_deepseekocr.py
- conversation.py
- deepencoder.py
- modeling_deepseekv2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
You are using a model of type deepseek_vl_v2 to instantiate

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/1 [00:00<?, ?it/s]

model-00001-of-000001.safetensors:   0%|          | 0.00/6.67G [00:00<?, ?B/s]

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Some weights of DeepseekOCRForCausalLM were not initialized from the model checkpoint at deepseek-ai/DeepSeek-OCR and are newly initialized: ['model.vision_model.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded: deepseek-ai/DeepSeek-OCR | attention: flash_attention_2 | dtype: torch.bfloat16


In [5]:
# =========================
# 5) Cleaner (strip detection tags; collapse duplicates)
# =========================
import re
# Matches a whole <|ref|>…<|/ref|> or <|det|>…<|/det|> span; DOTALL lets a span cross line breaks.
REF_DET_RE = re.compile(r"<\|(?:ref|det)\|>.*?<\|/(?:ref|det)\|>", re.DOTALL)

def clean_ocr_markdown(s: str) -> str:
    """Strip ref/det tag spans from OCR output and cap runs of repeated lines.

    Args:
        s: Raw OCR text; ``None`` or an empty string is treated as empty input.

    Returns:
        The text with every ``<|ref|>…<|/ref|>`` / ``<|det|>…<|/det|>`` span
        (tags and enclosed content) removed, at most three consecutive copies
        of any identical line kept, and surrounding whitespace stripped.
    """
    max_copies = 3  # identical consecutive lines beyond this count are dropped
    untagged = REF_DET_RE.sub("", s if s else "")

    kept = []
    previous = None
    streak = 0
    for current in untagged.splitlines():
        # Extend the streak on an exact repeat; any different line starts a new one.
        streak = streak + 1 if current == previous else 1
        previous = current
        if streak <= max_copies:
            kept.append(current)
    return "\n".join(kept).strip()


In [6]:
# =========================
# 6) Robust page inference (fast pass + retry)
# =========================
from datetime import datetime
import time, csv
from pathlib import Path

# Instruction sent with every page image; each entry below is one line of the prompt.
# NOTE(review): the prompt carries the <|grounding|> marker while also asking for no
# detection tags — confirm this combination is intended.
prompt = "\n".join([
    "<image>",
    "<|grounding|>Convert the document to clean, concise Markdown.",
    "- Use Markdown tables (not HTML) for tabular data.",
    "- Do not include detection tags like <|ref|> or <|det|>.",
    "",  # empty last entry keeps the trailing newline
])

# Passed to model.infer as base_size on every call.
BASE_SIZE = 1024

def infer_once(img_path: str, image_size: int, crop_mode: bool, save_results: bool):
    """Run one model.infer call on a page image using the notebook-wide defaults.

    Args:
        img_path: Path of the page image; converted with str() before the call.
        image_size: Forwarded to model.infer as ``image_size``.
        crop_mode: Forwarded to model.infer as ``crop_mode``.
        save_results: Forwarded to model.infer as ``save_results``.

    Returns:
        A single string: the stripped text returned by model.infer ("" when it
        returns nothing), or ``"[ERROR: <message>]"`` when anything inside the
        call raises. Exceptions are never propagated to the caller.
    """
    try:
        call_kwargs = {
            "prompt": prompt,
            "image_file": str(img_path),
            "output_path": str(md_dir),
            "base_size": BASE_SIZE,
            "image_size": image_size,
            "crop_mode": crop_mode,
            "save_results": save_results,
            "test_compress": False,
            "eval_mode": True,
        }
        raw = model.infer(tok, **call_kwargs)
        return raw.strip() if raw else ""
    except Exception as err:
        # Report the failure as text so one bad page does not abort the whole run.
        return f"[ERROR: {err}]"

def _is_real_text(text: str, min_chars: int) -> bool:
    """Return True when `text` has at least `min_chars` characters and is not an infer_once error marker."""
    return len(text) >= min_chars and not text.startswith("[ERROR:")


def infer_with_retry(img_path: str, min_chars: int = 40, min_file_chars: int = 10):
    """OCR one page, escalating from a fast pass to a cropped retry when needed.

    Args:
        img_path: Path of the page image, forwarded to infer_once.
        min_chars: Minimum length for an inference result to count as real text.
        min_file_chars: Minimum length for text read back from result.mmd.

    Returns:
        A ``(text, seconds, mode)`` tuple. ``mode`` is one of "512/no-crop",
        "640/crop", "640/crop+file" or "fallback"; ``seconds`` is the elapsed
        time for all attempts made on this page. In "fallback" mode the text may
        be empty or an "[ERROR: ...]" marker.
    """
    t0 = time.perf_counter()

    # 1) fast path: 512 + no crop.
    # "[ERROR: ...]" markers are rejected here: a long exception message would
    # otherwise pass the length check and be returned as if it were OCR text.
    txt = infer_once(img_path, image_size=512, crop_mode=False, save_results=False)
    if _is_real_text(txt, min_chars):
        return txt, time.perf_counter() - t0, "512/no-crop"

    # Remove any result.mmd left behind by an earlier page so step 3 can only
    # ever read a file produced by the retry below.
    mm = md_dir / "result.mmd"
    mm_is_fresh = True
    try:
        mm.unlink()
    except FileNotFoundError:
        pass
    except OSError as e:
        mm_is_fresh = False  # cannot rule out stale content, so skip step 3
        print(f"[warn] could not remove stale {mm}: {e}")

    # 2) retry: 640 + crop (helps dense tables / small fonts)
    txt2 = infer_once(img_path, image_size=640, crop_mode=True, save_results=True)
    if _is_real_text(txt2, min_chars):
        return txt2, time.perf_counter() - t0, "640/crop"

    # 3) last resort: if the helper wrote result.mmd during the retry, read it
    if mm_is_fresh and mm.exists():
        try:
            txt3 = mm.read_text(encoding="utf-8").strip()
        except (OSError, UnicodeDecodeError) as e:
            print(f"[warn] could not read {mm}: {e}")
        else:
            if _is_real_text(txt3, min_file_chars):
                return txt3, time.perf_counter() - t0, "640/crop+file"

    # Fallback: prefer short-but-genuine text over an error marker; otherwise
    # give back whatever we have (may be empty or an error marker).
    for candidate in (txt2, txt):
        if candidate and not candidate.startswith("[ERROR:"):
            return candidate, time.perf_counter() - t0, "fallback"
    return (txt2 or txt), time.perf_counter() - t0, "fallback"



In [7]:
# =========================
# 7) Run OCR on every page and persist the results
# =========================

# Destinations for the merged Markdown document and the per-page timing log.
combined_md_path = out_dir / f"{Path(pdf_path).stem}_combined.md"
timing_csv_path = out_dir / "timings.csv"

start_dt = datetime.now()
t_all = time.time()
print("Start:", start_dt.isoformat(), "| pages:", len(images))

# Both outputs stay open for the whole run; each page appends to them as it finishes.
with open(combined_md_path, "w", encoding="utf-8") as mdout, \
     open(timing_csv_path, "w", newline="", encoding="utf-8") as csvf:
    writer = csv.writer(csvf)
    writer.writerow(["page_index", "image_name", "seconds", "mode", "chars"])

    for i, img_path in enumerate(images, start=1):
        txt, secs, mode = infer_with_retry(img_path)
        cleaned = clean_ocr_markdown(txt)
        # Placeholder keeps empty pages visible in the written files.
        page_body = cleaned if cleaned else "[EMPTY OCR OUTPUT]"

        # One Markdown file per page ...
        per_page_md = md_dir / f"page_{i:04d}.md"
        per_page_md.write_text(page_body + "\n", encoding="utf-8")

        # ... plus a section in the combined document, introduced by a page marker comment.
        mdout.write(f"\n\n<!-- Page {i} -->\n\n{page_body}\n")

        # The "chars" column counts the cleaned text, so an empty page logs 0.
        writer.writerow([i, Path(img_path).name, f"{secs:.2f}", mode, len(cleaned)])
        print(f"Page {i}/{len(images)} | {secs:.2f}s | mode={mode} | chars={len(cleaned)}")

end_dt = datetime.now()
elapsed = time.time() - t_all
print("End:  ", end_dt.isoformat())
print(f"Total time: {elapsed:.2f} s for {len(images)} pages")
print("Per-page MDs:", md_dir)
print("Combined MD:", combined_md_path)
print("Timings CSV:", timing_csv_path)

Start: 2025-10-25T03:25:41.682829 | pages: 11
directly resize


The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48
The attention layers in this model are transitioning from computing the RoPE embeddings internally through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed `position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be removed and `position_embeddings` will be mandatory.


BASE:  torch.Size([1, 64, 1280])
NO PATCHES
Page 1/11 | 27.87s | mode=512/no-crop | chars=2065
directly resize
BASE:  torch.Size([1, 64, 1280])
NO PATCHES
Page 2/11 | 40.26s | mode=512/no-crop | chars=4436
directly resize
BASE:  torch.Size([1, 64, 1280])
NO PATCHES
Page 3/11 | 20.78s | mode=512/no-crop | chars=1883
directly resize
BASE:  torch.Size([1, 64, 1280])
NO PATCHES
Page 4/11 | 25.95s | mode=512/no-crop | chars=2433
directly resize
BASE:  torch.Size([1, 64, 1280])
NO PATCHES
Page 5/11 | 39.14s | mode=512/no-crop | chars=3570
directly resize
BASE:  torch.Size([1, 64, 1280])
NO PATCHES
Page 6/11 | 28.19s | mode=512/no-crop | chars=3422
directly resize
BASE:  torch.Size([1, 64, 1280])
NO PATCHES
Page 7/11 | 40.62s | mode=512/no-crop | chars=3528
directly resize
BASE:  torch.Size([1, 64, 1280])
NO PATCHES
Page 8/11 | 47.47s | mode=512/no-crop | chars=3979
directly resize
BASE:  torch.Size([1, 64, 1280])
NO PATCHES
Page 9/11 | 43.66s | mode=512/no-crop | chars=4372
directly resize
B

In [8]:
# =========================
# 8) Make a ZIP of everything and offer download
# =========================
import shutil
# Archive the contents of the output folder; the zip is created outside out_dir,
# under the base name given here.
zip_path = shutil.make_archive(
    base_name="deepseek_ocr_outputs",
    format="zip",
    root_dir=str(out_dir),
)
print("ZIP:", zip_path)

# Hand the archive to the browser via Colab's download helper.
from google.colab import files
files.download(zip_path)

ZIP: /content/deepseek_ocr_outputs.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>