<a href="https://colab.research.google.com/github/mithunkumarsr/LearnCSWithMithun/blob/master/InvoiceQRCodeCopier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#author: Mithun Kumar S R
#date 22Apr2025
# Install PyMuPDF, pyzbar, and system dependency zbar-tools
!pip install pymupdf pyzbar
!apt-get update && apt-get install -y zbar-tools

import fitz  # PyMuPDF
import os
import re
import io
from PIL import Image # Needed for pyzbar interaction
from pyzbar import pyzbar # Import pyzbar
import traceback # For detailed error printing

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading

In [3]:
def extract_invoice_number(pdf_path):
    """Extracts the invoice number (e.g., IN002025725) from a PDF."""
    try:
        doc = fitz.open(pdf_path)
        invoice_number = None
        # Regex looks for "Invoice" (case-insensitive) followed by optional
        # whitespace/colon/hash and then "IN" followed by digits. Adjust if needed.
        pattern = re.compile(r"Invoice\s*[:#]?\s*(IN\d+)", re.IGNORECASE)

        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text = page.get_text("text")
            match = pattern.search(text)
            if match:
                invoice_number = match.group(1)
                break # Found it, stop searching
        doc.close()
        return invoice_number
    except Exception as e:
        print(f"  - Error reading invoice number from {pdf_path}: {e}")
        return None

In [4]:
def find_qr_page_and_image(shell_pdf_path, invoice_number):
    """
    Finds the page for a given invoice number in the shell PDF, renders it,
    and uses pyzbar to find and decode QR codes. Extracts the image data
    for the first QR code found in the top-right by matching its location
    to image objects on the page, falling back to rendering if needed.
    """
    try:
        shell_doc = fitz.open(shell_pdf_path)
        qr_image_bytes = None
        qr_page_num = -1
        target_page = None
        qr_pdf_bbox = None # Store the bbox found by pyzbar

        # First, find the correct page
        for page_num in range(len(shell_doc)):
            page = shell_doc.load_page(page_num)
            text = page.get_text("text")
            if f"Document No.: {invoice_number}" in text or invoice_number in text:
                 qr_page_num = page_num
                 target_page = page
                 print(f"  - Found text match on page {qr_page_num + 1}")
                 break # Found page

        if target_page:
            page_rect = target_page.rect
            # print(f"  - Processing Page {qr_page_num + 1} with pyzbar. Page Rect: {page_rect}")

            # Get all image objects on the page *once*
            all_image_objects = target_page.get_images(full=True)
            # print(f"  - Found {len(all_image_objects)} image objects using get_images().")

            # Render page to a pixmap
            zoom = 2 # Increase if QR detection is poor, decrease for speed
            mat = fitz.Matrix(zoom, zoom)
            pix = target_page.get_pixmap(matrix=mat)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # Decode QR codes using pyzbar
            decoded_qrs = pyzbar.decode(img)
            # print(f"  - pyzbar found {len(decoded_qrs)} potential barcodes/QR codes.")

            if decoded_qrs:
                found_qr = False
                for qr in decoded_qrs:
                    if qr.type != 'QRCODE': # Focus only on QR codes
                        continue

                    qr_x, qr_y, qr_w, qr_h = qr.rect
                    # Convert pixel coordinates back to PDF points
                    pt_x0 = qr_x / zoom; pt_y0 = qr_y / zoom
                    pt_x1 = (qr_x + qr_w) / zoom; pt_y1 = (qr_y + qr_h) / zoom
                    qr_pdf_bbox = fitz.Rect(pt_x0, pt_y0, pt_x1, pt_y1)

                    # --- Positional Check (Top-Right using PDF coordinates) ---
                    # Checks if QR code *starts* in the top quarter and right half
                    is_top = pt_y0 < page_rect.height / 4
                    is_right = pt_x0 > page_rect.width / 2

                    if is_top and is_right:
                        print(f"  - Found potential QR Code in Top-Right via pyzbar. PDF BBox: {qr_pdf_bbox}")
                        found_qr = True
                        matching_img_xref = None
                        qr_center_point = qr_pdf_bbox.tl + (qr_pdf_bbox.width / 2, qr_pdf_bbox.height / 2)

                        # Try to find a matching image object from the PDF's internal list
                        for img_info in all_image_objects:
                            try:
                                img_bbox = target_page.get_image_bbox(img_info)
                                if img_bbox.contains(qr_center_point):
                                    matching_img_xref = img_info[0]
                                    break
                            except Exception: # Ignore errors getting bbox for some images
                                continue

                        if matching_img_xref:
                            # Extract the identified image object
                            print(f"    - Extracting image bytes from PDF object xref {matching_img_xref}.")
                            try:
                                base_image = shell_doc.extract_image(matching_img_xref)
                                qr_image_bytes = base_image["image"]
                            except Exception as extract_err:
                                 print(f"    - ERROR extracting image object {matching_img_xref}: {extract_err}. Will attempt render fallback.")
                                 qr_image_bytes = None # Ensure fallback is triggered
                        else:
                             qr_image_bytes = None # Ensure fallback is triggered if no match found

                        # If extraction from object failed or no object was found, fall back to rendering
                        if not qr_image_bytes:
                            print(f"    - WARNING: No matching image object found/extracted near {qr_pdf_bbox}. Rendering QR area from page pixmap.")
                            try:
                                # Define source area using integer pixel coordinates from pyzbar
                                qr_pix_area_irect_raw = fitz.IRect(qr_x, qr_y, qr_x + qr_w, qr_y + qr_h)
                                # Intersect with pixmap bounds using float Rects, then round
                                intersection_rect = fitz.Rect(qr_pix_area_irect_raw).intersect(fitz.Rect(pix.irect))
                                intersection_irect = intersection_rect.round()

                                if not intersection_irect.is_empty:
                                    # Create new pixmap and copy area
                                    qr_pix = fitz.Pixmap(fitz.csRGB, intersection_irect)
                                    qr_pix.copy(pix, intersection_irect)
                                    qr_image_bytes = qr_pix.tobytes("png")
                                    print("    - Successfully rendered QR area as PNG.")
                                else:
                                     print("    - ERROR: Clipped QR area is empty, cannot render.")
                                     found_qr = False
                            except Exception as render_err:
                                print(f"    - ERROR: Failed to render QR area: {render_err}")
                                traceback.print_exc()
                                found_qr = False

                        break # Stop processing QR codes once we've handled the first top-right one

        shell_doc.close()
        if qr_page_num != -1 and not qr_image_bytes:
             print(f"  - Found page {qr_page_num + 1} but did not successfully extract/render a QR code from the top-right via pyzbar.")

        return qr_page_num, qr_image_bytes

    except Exception as e:
        print(f"--- ERROR searching shell PDF {shell_pdf_path} for {invoice_number} using pyzbar ---")
        traceback.print_exc()
        print(f"--- END ERROR ---")
        return -1, None

In [5]:
def add_qr_to_invoice(invoice_path, output_path, qr_image_bytes):
    """
    Adds the QR code image bytes centered horizontally and placed
    a fixed distance from the top edge of the first page of the invoice PDF.
    """
    try:
        invoice_doc = fitz.open(invoice_path)
        if len(invoice_doc) == 0:
             print(f"  - Error adding QR code to {invoice_path}: PDF has no pages.")
             return

        page = invoice_doc[0]
        page_rect = page.rect

        # --- Define QR Code Size and Placement ---
        qr_width = 80  # Adjust width of pasted QR code if needed
        qr_height = 80 # Adjust height of pasted QR code if needed

        # Horizontal centering
        x0 = (page_rect.width - qr_width) / 2

        # Vertical position: Fixed margin from the top
        # Adjust this value to move placement up or down
        fixed_top_margin = 110 # Start 50 points down from the top edge (y=0)
        y0 = fixed_top_margin

        # Calculate bottom-right coordinates
        x1 = x0 + qr_width
        y1 = y0 + qr_height

        # Create the rectangle for the placement
        qr_code_rect = fitz.Rect(x0, y0, x1, y1)
        # print(f"  - Calculated placement rect: {qr_code_rect} on page size {page_rect.width}x{page_rect.height}")

        if qr_image_bytes:
            # Ensure the output directory exists before saving
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            # Insert image
            page.insert_image(qr_code_rect, stream=qr_image_bytes)
            # Save the modified document
            invoice_doc.save(output_path)
            print(f"  - Successfully added QR code to {output_path}")
        else:
             print(f"  - Skipping saving {output_path} - No QR code image data provided.")

        invoice_doc.close()

    except Exception as e:
        if isinstance(e, RuntimeError) and "cannot open output file" in str(e):
             print(f"Error adding QR code to {invoice_path}: Could not write to output path '{output_path}'. Check permissions or if the directory exists.")
        else:
             print(f"--- ERROR adding QR code to {invoice_path} ---")
             traceback.print_exc()
             print(f"--- END ERROR ---")

In [6]:
# --- Configuration ---
# !!! IMPORTANT: Update these paths for your environment !!!

# Example for Colab after uploading/mounting Drive:
invoice_folder = "/content/invoices/" # Folder containing original invoices (e.g., IN*.pdf)
shell_pdf_path = "/content/shell/shell.pdf" # Path to the PDF containing QR codes
output_folder = "/content/output" # Folder to save modified invoices

# Example for local machine (macOS/Linux):
# invoice_folder = "/Users/your_username/Desktop/invoices"
# shell_pdf_path = "/Users/your_username/Desktop/shell/shell.pdf"
# output_folder = "/Users/your_username/Desktop/output"

# Example for local machine (Windows):
# invoice_folder = r"C:\Users\your_username\Desktop\invoices" # Use raw string (r"...")
# shell_pdf_path = r"C:\Users\your_username\Desktop\shell\shell.pdf"
# output_folder = r"C:\Users\your_username\Desktop\output"

# --- Optional: Adjust QR Code size/placement in Cell 4 ---
# Variables 'qr_width', 'qr_height', 'fixed_top_margin' in add_qr_to_invoice function.

In [7]:
# --- Main Processing Loop ---
if not os.path.exists(output_folder):
    try:
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")
    except OSError as e:
        print(f"Error creating output folder '{output_folder}': {e}")
        invoice_files = [] # Prevent processing if output folder fails
        # exit() # Uncomment if you want to stop execution

# Get a list of PDF files in the invoice folder
if 'invoice_files' not in locals(): # Check if invoice_files might be undefined due to error above
    try:
        # Ensure path separators are correct for the OS (though Colab uses '/')
        invoice_folder = os.path.normpath(invoice_folder)
        output_folder = os.path.normpath(output_folder)
        shell_pdf_path = os.path.normpath(shell_pdf_path)

        print(f"Looking for invoices in: {invoice_folder}")
        invoice_files = [f for f in os.listdir(invoice_folder) if f.lower().endswith(".pdf")]
    except FileNotFoundError:
        print(f"Error: Invoice folder '{invoice_folder}' not found.")
        invoice_files = []
    except Exception as list_err:
         print(f"Error listing files in '{invoice_folder}': {list_err}")
         invoice_files = []

# Proceed only if paths exist and files are found
if not invoice_files:
     print(f"No PDF files found to process in: {invoice_folder}")
elif not os.path.exists(shell_pdf_path):
     print(f"Error: Shell PDF '{shell_pdf_path}' not found.")
else:
    print(f"\nStarting batch processing for {len(invoice_files)} invoices...")
    processed_count = 0
    skipped_count = 0
    error_count = 0

    for filename in invoice_files:
        try:
            # Construct full paths
            invoice_path = os.path.join(invoice_folder, filename)
            output_path = os.path.join(output_folder, filename)

            # Basic check to avoid processing files already in output
            if os.path.exists(output_path):
                # print(f"Skipping {filename}, already exists in output folder.")
                continue

             # Avoid processing the shell PDF itself if it's in the invoice folder
            if os.path.samefile(invoice_path, shell_pdf_path):
                print(f"Skipping {filename} as it is the shell PDF.")
                continue

            print(f"\nProcessing: {filename}")

            # 1. Extract Invoice Number
            invoice_num = extract_invoice_number(invoice_path)
            if not invoice_num:
                print(f"  - Could not extract invoice number from {filename}. Skipping.")
                skipped_count += 1
                continue
            # print(f"  - Found Invoice Number: {invoice_num}") # Optional print

            # 2. Find and Extract QR Code
            qr_page, qr_bytes = find_qr_page_and_image(shell_pdf_path, invoice_num)

            if qr_page == -1:
                 print(f"  - Could not find page containing {invoice_num} in {os.path.basename(shell_pdf_path)}. Skipping.")
                 skipped_count += 1
                 continue
            if not qr_bytes:
                 print(f"  - Could not extract QR code for {invoice_num}. Skipping.") # Message printed in function
                 skipped_count += 1
                 continue

            # 3. Add QR Code to Invoice
            add_qr_to_invoice(invoice_path, output_path, qr_bytes)
            if os.path.exists(output_path):
                 processed_count +=1
            else:
                 print(f"  - Failed to create output file for {filename}.")
                 error_count += 1 # Count as error if saving failed

        except Exception as loop_err:
            print(f"--- UNEXPECTED ERROR processing file {filename} ---")
            traceback.print_exc()
            print(f"--- END UNEXPECTED ERROR ---")
            error_count += 1
            continue # Move to next file

    print(f"\nBatch processing finished.")
    print(f"  Successfully processed: {processed_count}")
    print(f"  Skipped / No QR found: {skipped_count}")
    print(f"  Errors during processing: {error_count}")

Created output folder: /content/output
Looking for invoices in: /content/invoices

Starting batch processing for 5 invoices...

Processing: IN002025566.pdf
  - Found text match on page 47
  - Found potential QR Code in Top-Right via pyzbar. PDF BBox: Rect(616.5, 34.0, 808.0, 225.5)
    - Successfully rendered QR area as PNG.
  - Successfully added QR code to /content/output/IN002025566.pdf

Processing: IN002025567.pdf
  - Found text match on page 48
  - Found potential QR Code in Top-Right via pyzbar. PDF BBox: Rect(616.5, 34.0, 808.0, 225.5)
    - Successfully rendered QR area as PNG.
  - Successfully added QR code to /content/output/IN002025567.pdf

Processing: IN002025565.pdf
  - Found text match on page 46
  - Found potential QR Code in Top-Right via pyzbar. PDF BBox: Rect(616.5, 34.0, 808.0, 225.5)
    - Successfully rendered QR area as PNG.
  - Successfully added QR code to /content/output/IN002025565.pdf

Processing: IN002025564.pdf
  - Found text match on page 45
  - Found pote