<a href="https://colab.research.google.com/github/ngusadeep/CRUD-springboot/blob/main/qwen2_5vl_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Environment setup cell; %%capture hides the (long) installer output.
# Python packages: `pdf2image` and `pydantic` are imported further down in this notebook.
%pip install ollama pdf2image pydantic
# NOTE(review): colab-xterm is not referenced by any later cell shown here — presumably for manual debugging; confirm it is still needed.
%pip install colab-xterm
# System packages. poppler-utils is presumably the PDF renderer pdf2image relies on;
# pciutils/lshw are presumably for hardware detection by the Ollama installer — TODO confirm.
!sudo apt-get update
!sudo apt-get install -y pciutils lshw poppler-utils

In [None]:
# Install Ollama by downloading the install script from ollama.com and piping it straight into `sh`.
# NOTE(review): this executes a remote script without inspection — acceptable on a throwaway Colab VM only.
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
import subprocess
import requests
import json
import threading
from pprint import pprint
def run_ollama(host="127.0.0.1", port=11434, timeout=60.0):
    """Start `ollama serve` in the background and wait until it accepts connections.

    `subprocess.Popen` returns immediately, so without the readiness wait the
    following cells (e.g. `ollama pull`) can run before the server is listening
    when the notebook is executed with "Run all".

    Args:
        host: Address the server is expected to listen on.
        port: TCP port the server is expected to listen on.
        timeout: Maximum number of seconds to wait for the port to open.

    Returns:
        The `subprocess.Popen` handle of the spawned `ollama serve` process.

    Raises:
        RuntimeError: The process exited before the port became reachable.
        TimeoutError: The port did not open within `timeout` seconds.
    """
    # Local imports keep this cell self-contained (the cell only imports
    # subprocess/requests/json/threading/pprint at the top).
    import socket
    import time

    def _port_open():
        # A successful TCP connect is enough to know the server is listening.
        try:
            with socket.create_connection((host, port), timeout=1):
                return True
        except OSError:
            return False

    proc = subprocess.Popen(["ollama", "serve"])
    deadline = time.monotonic() + timeout
    while True:
        # Checked first so that an already-running server (e.g. this cell was
        # re-run) counts as success even though the new process cannot bind.
        if _port_open():
            return proc
        if proc.poll() is not None:
            raise RuntimeError(
                f"'ollama serve' exited with code {proc.returncode} before listening on {host}:{port}"
            )
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"'ollama serve' did not start listening on {host}:{port} within {timeout} seconds"
            )
        time.sleep(0.5)


# The thread is kept so the module-level name `thread` stays available;
# join() makes the cell finish only once the server is reachable (or failed).
thread = threading.Thread(target=run_ollama)
thread.start()
thread.join()

In [None]:
# Download the vision model used by process_image (its default `model` argument is "qwen2.5vl:7b").
!ollama pull qwen2.5vl:7b
# Show the locally available models.
!ollama list
# Query the local server (port 11434) for the model as a quick connectivity check.
!curl http://localhost:11434/v1/models/qwen2.5vl:7b

In [None]:
import base64
import csv
import json
import os
import random
import re
import time
from pathlib import Path
from typing import Optional

import requests
from google.colab import files
from pdf2image import convert_from_path
from PIL import Image
from pydantic import BaseModel, ValidationError

# Endpoint of the local Ollama server that process_image POSTs its request payload to.
OLLAMA_URL = "http://127.0.0.1:11434/api/generate"

In [None]:
def upload_file():
    """Prompt for a file upload in Colab and give it a normalized name on disk.

    The name is lower-cased, spaces become underscores, and every character
    outside ``a-z 0-9 _ . -`` is dropped. If the normalized name differs from
    the uploaded one, the file is renamed; a numeric suffix (``_1``, ``_2``, ...)
    is appended when the normalized name is already taken by another file.
    Only the first uploaded file is handled.

    Returns:
        The final filename (relative to the current working directory).

    Raises:
        ValueError: No file was uploaded (e.g. the dialog was cancelled).
    """
    uploads = files.upload()
    if not uploads:
        raise ValueError("No file was uploaded.")
    original_filename = next(iter(uploads))

    # normalize filename
    cleaned_filename = original_filename.lower().replace(" ", "_")
    cleaned_filename = re.sub(r"[^a-z0-9_.-]", "", cleaned_filename)

    # A name made only of stripped characters would collapse to "" or ".ext";
    # give it a stem so the extension (and the derived output names) stay usable.
    if not cleaned_filename or cleaned_filename.startswith("."):
        cleaned_filename = f"upload{cleaned_filename}"

    final_filename = cleaned_filename

    # Only look for a free name when a rename is actually required. Checking
    # unconditionally would always find the just-uploaded file itself and
    # needlessly rename an already-clean "scan.png" to "scan_1.png".
    if final_filename != original_filename:
        base, ext = os.path.splitext(cleaned_filename)
        counter = 1
        while os.path.exists(final_filename):
            final_filename = f"{base}_{counter}{ext}"
            counter += 1
        os.rename(original_filename, final_filename)

    print(f"‚úÖ Uploaded: {final_filename}")
    return final_filename

In [None]:
def is_image_file(filename):
    """Tell whether `filename` ends with a supported image extension.

    The comparison is case-insensitive; recognized suffixes are
    ``.png``, ``.jpeg`` and ``.jpg``.
    """
    image_suffixes = ('.png', '.jpeg', '.jpg')
    lowered = filename.lower()
    return any(lowered.endswith(suffix) for suffix in image_suffixes)

In [None]:
def extract_pdf_pages_as_images(pdf_path):
    """Render every page of a PDF to a PNG file and return the PNG paths.

    The images are written as ``page1.png``, ``page2.png``, ... into a folder
    (relative to the current working directory) named after the PDF file
    without its extension; the folder is created if it does not exist.

    Args:
        pdf_path: Path of the PDF to convert.

    Returns:
        List of the written image paths, in page order.
    """
    # Folder name = PDF filename minus directory and extension.
    stem, _ = os.path.splitext(os.path.basename(pdf_path))
    os.makedirs(stem, exist_ok=True)

    image_paths = []
    for page_no, page_image in enumerate(convert_from_path(pdf_path), start=1):
        target = os.path.join(stem, f"page{page_no}.png")
        page_image.save(target, 'PNG')
        image_paths.append(target)

    print(f"üìÑ Extracted {len(image_paths)} pages from '{pdf_path}'")
    return image_paths

In [None]:
# Instructions sent to the model with every page: the fields to extract, the
# per-terminal label that holds each field, and the output rules. The field
# names listed here must match the keys of the ShipmentData schema exactly,
# because the model's JSON answer is validated against that schema.
system_prompt = (
        "You are a precise and reliable OCR + document parser AI that extracts "
        "structured shipment details from scanned container shipments documents, primarily "
        "equipment interchange reports (EIRs) and export container orders for empty containers "
        "being gated out from terminals in Tanzania.\n\n"
        "Your job is to look at the uploaded image and extract ONLY these fields:\n"
        "- container_terminal\n"
        "- shipment_date (in YYYY-MM-DD format)\n"
        "- shipment_number\n"
        "- container_number\n"
        "- container_size (Should be either 20 or 40 only)\n\n"
        "Mappings:\n"
        "Use this mapping below to know where to get each item above on each terminal document.\n\n"
        "APM TERMINALS\n"
        "- container_terminal >> APM TERMINALS\n"
        "- shipment_number >> EIR No\n"
        "- shipment_date >> Gate Out Date\n"
        "- container_number >> Container No\n"
        "- container_size >> Size/Type\n"
        "FANTUZZI INVESTMENTS LTD\n"
        # Fixed: this key was misspelled "container_teminal", which does not
        # match the field name the model is asked to return.
        "- container_terminal >> FANTUZZI INVESTMENT LTD\n"
        "- shipment_number >> Request No.\n"
        "- shipment_date >> Date out\n"
        "- container_number >> Container No.\n"
        "- container_size >> Type\n"
        "KURASINI CONTAINER TERMINAL LTD\n"
        "- container_terminal >> KURASINI CONTAINER TERMINAL LTD\n"
        "shipment_number >> KCT/OUT/\n"
        "shipment_date >>  DATED\n"
        "container_number >> CONTAINER NUMBER\n"
        "container_size >> LENGTH\n"
        "ORION TRANSPORT (T) LTD\n"
        "container_terminal >> ORION TRANSPORT (T) LTD\n"
        "shipment_number >> Outward No\n"
        "shipment_date >> Date out\n"
        "container_number >> Container #\n"
        "container_size >> Container size\n"
        "TAZAMA PIPELINES LTD\n"
        "container_terminal  >> TAZAMA PIPELINES LTD\n"
        "shipment_number >> Bill of Lading\n"
        "shipment_date >> Interchange Date\n"
        "container_number  >> Container No\n"
        "container_size >> Container Size\n\n"
        "Rules:\n"
        "- Container size MUST be extracted as an integer, either 20 or 40. If the document shows a value like '22G1', extract the numerical part '20'. If '42G1', extract '40'.\n"
        "- Respond ONLY with a valid JSON object. No explanations, no text before or after.\n"
        "- If a field is missing or unclear, output it as null.\n"
        "- Be tolerant to printed partially obscured data.\n"
    )

# Per-request instruction appended after the system prompt.
user_prompt = (
        "Extract the container terminal, shipment date, shipment number, container number and container size from this document image. "
        "Return strictly in JSON format."
    )

class ShipmentData(BaseModel):
    """Fields extracted from one shipment document page.

    Every field accepts ``None`` (and defaults to it) because the system prompt
    instructs the model to output ``null`` for values that are missing or
    unclear; with required, non-nullable fields such an answer would be
    rejected during validation.
    """

    # Name of the container terminal that issued the document.
    container_terminal: Optional[str] = None
    # Date string; the prompt asks the model for YYYY-MM-DD.
    shipment_date: Optional[str] = None
    shipment_number: Optional[str] = None
    container_number: Optional[str] = None
    # The prompt asks the model for 20 or 40.
    container_size: Optional[int] = None

In [None]:
def _extract_json_text(raw):
    """Return the JSON object embedded in raw model output.

    Handles a Markdown code fence (with or without a ``json`` tag) and stray
    prose before/after the object. If no ``{...}`` span is found the stripped
    text is returned unchanged so the validation step reports the problem.
    """
    text = raw.strip()
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        text = fenced.group(1).strip()
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]
    return text


def process_image(image_path, model="qwen2.5vl:7b", timeout=600):
    """Send one document image to Ollama and return the parsed shipment fields.

    Args:
        image_path: Path of the image file to analyse.
        model: Name of the Ollama model to query.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            stalled server cannot hang the notebook forever.

    Returns:
        A `ShipmentData` instance built from the model's JSON answer.

    Raises:
        requests.exceptions.JSONDecodeError: The HTTP response body is not JSON.
        KeyError: The response JSON has no 'response' key (e.g. an API error).
        Exception: Whatever `ShipmentData.model_validate_json` raises when the
            model output is not valid JSON or does not match the schema.
    """
    # Read image and base64-encode it for the JSON payload.
    with open(image_path, "rb") as img:
        img_b64 = base64.b64encode(img.read()).decode()

    # Build Ollama payload
    payload = {
        "model": model,
        "prompt": system_prompt + "\n\n" + user_prompt,
        "images": [img_b64],
        "stream": False
    }

    # Send to Ollama
    res = requests.post(OLLAMA_URL, json=payload, timeout=timeout)

    try:
        response_json = res.json()
    except requests.exceptions.JSONDecodeError as e:
        print(f"‚ùå Failed to decode JSON from Ollama API: {e}")
        print("Raw Ollama API response:")
        print(res.text)
        raise

    if "response" not in response_json:
        print("‚ùå Ollama API response did not contain 'response' key.")
        print("Full Ollama API response:")
        pprint(response_json)
        raise KeyError("Missing 'response' key in Ollama API output")

    # Models often wrap the JSON in a code fence or add prose despite the prompt.
    data = _extract_json_text(response_json["response"])

    # Parse into Pydantic model
    try:
        parsed = ShipmentData.model_validate_json(data)
    except Exception as e:
        print("‚ùå JSON parse error:", e)
        print("Raw model output:")
        print(data)
        raise

    return parsed

In [None]:
def main():
    """
    Handles upload of either an image or a PDF.
    - For image: extracts data from it.
    - For PDF: splits into pages, processes each page.
    - Writes results to a CSV file and triggers its download.

    A page whose extraction fails is reported and skipped so that one bad page
    does not discard the results already obtained from the other pages.

    Raises:
        ValueError: The uploaded file is neither a supported image nor a PDF.
    """

    filename = upload_file()

    start_time = time.time()
    print("\n‚è≥ Processing...")

    # Define output CSV filename
    base_name = os.path.splitext(filename)[0]
    csv_filename = f"{base_name}_extracted.csv"

    # Detect file type and extract page images if needed
    if is_image_file(filename):
        image_paths = [filename]
    elif filename.lower().endswith(".pdf"):
        image_paths = extract_pdf_pages_as_images(filename)
    else:
        raise ValueError("Unsupported file type. Only PDF, JPG, or PNG are allowed.")

    # Define CSV columns: bookkeeping columns followed by the extracted fields
    data_fields = ["container_terminal", "shipment_date", "shipment_number", "container_number", "container_size"]
    csv_columns = ["file", "page_number", *data_fields]
    all_data = []
    failed_pages = []

    # Process each image
    for page_number, img_path in enumerate(image_paths, start=1):
        print(f"‚û°Ô∏è Processing page {page_number}/{len(image_paths)}: {img_path}")
        try:
            data = process_image(img_path)
        except Exception as exc:
            # process_image has already printed the details; keep going with
            # the remaining pages instead of losing everything extracted so far.
            failed_pages.append(page_number)
            print(f"Skipping page {page_number} ({img_path}): {exc}")
            continue

        row = {"file": os.path.basename(filename), "page_number": page_number}
        row.update({field: getattr(data, field) for field in data_fields})
        all_data.append(row)

    # Calculate total time
    total_time = time.time() - start_time

    print("‚úÖ Processing done.\n")
    print(f"‚è±Ô∏è Total time: {total_time:.2f} seconds")
    if failed_pages:
        print(f"Pages that could not be processed: {failed_pages}")

    # Save results to CSV
    if all_data:
        with open(csv_filename, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            writer.writeheader()
            writer.writerows(all_data)

        print(f"\n‚úÖ Data saved to: {csv_filename}")
        files.download(csv_filename)
    else:
        print("‚ö†Ô∏è No valid data extracted.")

In [None]:
# Run the pipeline: prompts for an upload, extracts the fields page by page, then writes and downloads the CSV.
main()