<a href="https://colab.research.google.com/github/ngusadeep/CRUD-springboot/blob/main/docs_parser_with_deepseek_ocr_3b_model3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Ollama
!curl -fsSL https://ollama.com/install.sh | sh

# Start Ollama server in background
!nohup ollama serve > /content/ollama.log 2>&1 &

# (Optional) List models / check status
!ollama --version


In [None]:
# Pull the `deepseek-ocr:3b` model tag; this is the model name that
# `call_ollama` requests later in the notebook.
!ollama pull deepseek-ocr:3b
# List the locally available models to confirm the pull succeeded.
!ollama list

In [None]:
import os, json, tempfile, base64, csv
from pdf2image import convert_from_path
from PIL import Image
from google.colab import files
import subprocess

In [None]:
def upload_file():
    """Ask the user to upload a file and return the first uploaded name.

    Calls ``files.upload()`` and takes the first key of the mapping it
    returns. Any additional uploaded entries are ignored.

    Returns:
        The first key of the upload result (used by callers as a file name).

    Raises:
        IndexError: If the upload result is empty.
    """
    uploaded = files.upload()
    return [*uploaded.keys()][0]

def pdf_to_images(pdf_path):
    """Render every page of a PDF to a PNG file and return the file paths.

    The PDF is converted with ``convert_from_path`` at 300 dpi. Pages are
    written as ``page_1.png``, ``page_2.png``, ... into a directory named
    after the PDF (its base name without extension). That directory is
    created relative to the current working directory if it is missing.

    Args:
        pdf_path: Path of the PDF to convert.

    Returns:
        List of the written image paths, in page order.
    """
    rendered_pages = convert_from_path(pdf_path, dpi=300)

    # Output directory = PDF file name with its extension stripped.
    out_dir, _ext = os.path.splitext(os.path.basename(pdf_path))
    os.makedirs(out_dir, exist_ok=True)

    written = []
    for page_no, page in enumerate(rendered_pages, start=1):
        target = os.path.join(out_dir, f"page_{page_no}.png")
        page.save(target, "PNG")
        written.append(target)
    return written


In [None]:
def call_ollama(image_path, system_prompt, user_prompt,
                model="deepseek-ocr:3b", host="http://127.0.0.1:11434",
                timeout=120):
    """Send one image plus prompts to the local Ollama server and return its reply.

    The request goes to Ollama's REST endpoint ``/api/generate`` rather than
    the ``ollama run`` CLI: the CLI has no ``--prompt``, ``--image`` or
    ``--no-stream`` flags, so a CLI call built with them fails and yields no
    output.

    Args:
        image_path: Path of the image file to send to the model.
        system_prompt: System instructions for the model.
        user_prompt: The user request for this image.
        model: Ollama model tag to query.
        host: Base URL of the Ollama server.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The model's generated text (the ``response`` field of the reply),
        or an empty string if the reply has no such field.

    Raises:
        OSError: If the image cannot be read (e.g. ``FileNotFoundError``).
        urllib.error.URLError: If the server is unreachable or returns an
            HTTP error status.
    """
    import urllib.request  # local import: only this function needs it

    # Read the image first so a bad path fails fast, before any network call.
    with open(image_path, "rb") as image_file:
        image_b64 = base64.b64encode(image_file.read()).decode("ascii")

    payload = {
        "model": model,
        "system": system_prompt,
        "prompt": user_prompt,
        "images": [image_b64],
        # Ask for a single JSON reply instead of a stream of chunks.
        "stream": False,
    }
    request = urllib.request.Request(
        host.rstrip("/") + "/api/generate",
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        reply = json.loads(response.read().decode("utf-8"))
    return reply.get("response", "")

# System prompt sent with every page. It deliberately uses plain ASCII hyphens
# (e.g. in the YYYY-MM-DD date format) so the model is not nudged into echoing
# look-alike Unicode hyphens in the extracted values.
SYSTEM_PROMPT = """
You are a precise OCR + document-parser AI that extracts structured shipment details from scanned container shipment documents.
Extract ONLY the fields:
container_terminal, shipment_date (YYYY-MM-DD), shipment_number, container_number, container_size.
Return strictly a JSON object. Use null if a field is missing.
"""


In [None]:
# Fields requested from the model, in the order they appear in the CSV.
EXTRACTED_FIELDS = [
    "container_terminal", "shipment_date",
    "shipment_number", "container_number", "container_size",
]


def _parse_extraction(raw):
    """Return the expected fields parsed from the model reply, or None.

    The reply may wrap the JSON object in extra text or code fences, so the
    text between the first ``{`` and the last ``}`` is what gets parsed.
    Only the keys in ``EXTRACTED_FIELDS`` are kept (missing ones become
    ``None``); unexpected keys would otherwise make ``csv.DictWriter`` raise.
    """
    text = raw or ""
    start, end = text.find("{"), text.rfind("}")
    if start == -1 or end <= start:
        return None
    try:
        parsed = json.loads(text[start:end + 1])
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    if not isinstance(parsed, dict):
        return None
    return {field: parsed.get(field) for field in EXTRACTED_FIELDS}


fname = upload_file()

# A PDF is rasterised to one image per page; any other upload is treated as
# a single image.
if fname.lower().endswith(".pdf"):
    images = pdf_to_images(fname)
else:
    images = [fname]

rows = []
for i, img in enumerate(images, start=1):
    print("Processing", img)
    raw = call_ollama(img, SYSTEM_PROMPT, "Extract the required fields.")
    data = _parse_extraction(raw)
    if data is None:
        # Keep the page in the CSV with empty fields, but say so instead of
        # hiding the failure.
        print("  Warning: no JSON object found in model output for", img)
        data = dict.fromkeys(EXTRACTED_FIELDS)
    rows.append({"filename": fname, "page_number": i, **data})

csv_name = fname + "_extracted.csv"
with open(csv_name, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["filename", "page_number", *EXTRACTED_FIELDS])
    writer.writeheader()
    writer.writerows(rows)

print("✅ Saved:", csv_name)
files.download(csv_name)
