In [16]:
!pip install streamlit python-dotenv transformers huggingface-hub pymupdf

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-dotenv, pymupdf
Successfully installed pymupdf-1.26.0 python-dotenv-1.1.0


In [17]:
import streamlit as st
import os
import time
import torch
import tempfile
from PIL import Image
from dotenv import load_dotenv
import fitz

In [18]:
# Load environment variables from a local .env file, if one exists.
load_dotenv()

# Read the Hugging Face access token from the HF_TOKEN environment variable.
# SECURITY: a literal token used to be passed here as the variable *name*, which
# both exposed the credential in the notebook and meant the lookup could not
# find it. That token must be treated as compromised and revoked; supply a new
# one via `HF_TOKEN=...` in .env or the process environment.
HF_TOKEN = os.getenv("HF_TOKEN")

# Probe for the model libraries without crashing at import time, so the UI can
# report a missing installation instead of failing with a traceback.
try:
    from transformers import AutoProcessor, AutoModelForVision2Seq
    from huggingface_hub import login
    transformers_available = True
except ImportError:
    transformers_available = False

In [19]:
def check_dependencies():
    """Report which pip requirements are missing.

    Reads the module-level ``transformers_available`` flag and returns a list
    of requirement strings to install; the list is empty when the flag is true.
    """
    return [] if transformers_available else ["transformers huggingface_hub"]


@st.cache_resource
def _load_ocr_model(device):
    """Load the SmolDocling processor and model onto ``device``.

    Wrapped in ``st.cache_resource`` so the weights are loaded (and the Hub
    login performed) once per device and then reused, instead of being
    repeated for every image or PDF page.
    """
    if HF_TOKEN:
        login(token=HF_TOKEN)

    model_id = "ds4sd/SmolDocling-256M-preview"
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        torch_dtype=torch.float32,
    ).to(device)
    return processor, model


def process_single_image(image, prompt_text="Convert this page to docling.", max_new_tokens=1024):
    """Run the OCR model on one image and return the decoded text.

    Args:
        image: The page image handed to the processor (callers in this file
            pass RGB PIL images).
        prompt_text: Instruction text placed after the image in the chat prompt.
        max_new_tokens: Upper bound on generated tokens; output longer than
            this is cut off. Defaults to the previous hard-coded value.

    Returns:
        A ``(raw_text, processing_time)`` tuple: the stripped decoded output
        with special tokens removed, and the elapsed wall-clock seconds
        (including model loading when the cache is cold).

    Raises:
        Exception: Any error raised while loading the model is shown with
            ``st.error`` and then re-raised unchanged.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    start_time = time.time()

    try:
        processor, model = _load_ocr_model(device)
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        raise

    # Build prompt for raw OCR: one user turn holding the image and the instruction.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt_text}
            ]
        },
    ]

    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + continuation; drop the prompt tokens so only
    # the newly generated text is decoded.
    prompt_length = inputs.input_ids.shape[1]
    trimmed_generated_ids = generated_ids[:, prompt_length:]

    raw_text = processor.batch_decode(trimmed_generated_ids, skip_special_tokens=True)[0].strip()
    processing_time = time.time() - start_time

    return raw_text, processing_time

In [20]:
def process_pdf(pdf_file, prompt_text="Convert this page to docling."):
    """Run OCR on every page of a PDF and return the combined text.

    Args:
        pdf_file: A readable binary file-like object; its full contents are
            consumed with ``read()``.
        prompt_text: Instruction text forwarded to ``process_single_image``.

    Returns:
        A ``(combined_text, total_processing_time)`` tuple. Each page's text is
        prefixed with ``--- Page N ---`` and pages are separated by a blank
        line; the time is the sum of the per-page processing times.
    """
    pdf_bytes = pdf_file.read()

    all_raw_text = []
    total_processing_time = 0

    # Open the document straight from memory and close it deterministically.
    # The previous implementation wrote a delete=False temp file that was never
    # removed and left the document handle open.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            # Rendered with get_pixmap() defaults; the samples are interpreted
            # as packed RGB bytes below.
            pix = page.get_pixmap()
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            raw_text, processing_time = process_single_image(image, prompt_text)
            all_raw_text.append(f"--- Page {page_num + 1} ---\n{raw_text}")
            total_processing_time += processing_time

    combined_text = "\n\n".join(all_raw_text)
    return combined_text, total_processing_time

In [21]:
def main():
    """Render the Streamlit page for extracting OCR text from an image or PDF.

    Sets up the page, warns when no HF token was loaded, stops early if the
    model libraries are missing, gathers the upload and prompt from the
    sidebar, and on button press runs OCR and shows the text with a download
    button. Processing errors are reported with ``st.error``.
    """
    st.set_page_config(page_title="OCR Text Extractor", layout="wide")
    st.title("🧾 OCR Text Extractor (Image & PDF)")

    st.write("Upload an image or PDF receipt to extract raw OCR text using SmolDocling.")

    # A missing token is only a warning; the app still tries to run.
    if not HF_TOKEN:
        st.warning("⚠️ HF_TOKEN not found in .env file. Authentication may fail.")

    # Nothing below can work without the model libraries, so halt the script.
    missing = check_dependencies()
    if missing:
        st.error(f"Missing dependencies: {', '.join(missing)}. Please install them.")
        st.info("Install with: pip install " + " ".join(missing))
        st.stop()

    image_upload = None
    pdf_upload = None

    with st.sidebar:
        st.header("📎 Upload Input")
        mode = st.radio("Choose file type:", ["Single Image", "PDF File"])
        ocr_prompt = st.text_input("Prompt for OCR (default recommended)", "Convert this page to docling.")

        # Only the uploader matching the selected mode is rendered.
        if mode == "Single Image":
            image_upload = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
        else:
            pdf_upload = st.file_uploader("Upload a PDF", type=["pdf"])

    if mode == "Single Image":
        if image_upload is not None:
            picture = Image.open(image_upload).convert("RGB")
            st.image(picture, caption="Uploaded Image", width=300)

            if st.button("Process Image"):
                with st.spinner("Processing image..."):
                    try:
                        text, elapsed = process_single_image(picture, ocr_prompt)
                        st.subheader("📝 Extracted Text")
                        st.text_area("OCR Result", text, height=400)
                        st.download_button("Download Text", text, file_name="ocr_output.txt")
                        st.success(f"Processing completed in {elapsed:.2f} seconds")
                    except Exception as err:
                        st.error(f"Error: {str(err)}")

    elif mode == "PDF File":
        if pdf_upload is not None:
            if st.button("Process PDF"):
                with st.spinner("Processing PDF..."):
                    try:
                        text, elapsed = process_pdf(pdf_upload, ocr_prompt)
                        st.subheader("📄 Extracted Text from PDF")
                        st.text_area("OCR Result", text, height=400)
                        st.download_button("Download Text", text, file_name="ocr_pdf_output.txt")
                        st.success(f"PDF processed in {elapsed:.2f} seconds")
                    except Exception as err:
                        st.error(f"Error: {str(err)}")

    with st.expander("ℹ️ About"):
        st.write("""
            This tool uses the [SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) model from Hugging Face to perform OCR on uploaded images or PDFs.

            - Only **raw text** is extracted (no layout/formatting).
            - Useful for extracting receipt text, invoices, or any scanned document.
        """)


# Entry point: build the UI by calling main() when this code runs as the
# top-level script.
if __name__ == "__main__":
    main()

2025-06-01 07:01:21.343 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-06-01 07:01:21.359 Session state does not function when running a script without `streamlit run`


In [23]:
# NOTE(review): this passes the Colab kernel launcher script to `streamlit run`,
# not a file containing the app defined in the cells above. To serve the app,
# its code presumably needs to be saved to a script first (e.g. with a
# `%%writefile app.py` cell) and that path used here instead. TODO confirm.
!streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.169.231.248:8501[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m
