In [1]:
import numpy as np
import cv2
import matplotlib.pyplot as plt
import math 
from PIL import Image
import imagehash

In [2]:
def load_image(img_path):
    """
    Loads grayscale image from a given path
    """
    img = cv2.imread(img_path,cv2.IMREAD_GRAYSCALE)
    return img

In [3]:
def binarize_image(img):
    """
    Binarizes a grayscale image using Otsu's thresholding.
    Returns a binary image (0 = black, 255 = white).
    """
    # Apply Otsu's thresholding
    _, binary = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return cv2.bitwise_not(binary)

In [4]:
def get_bounding_boxes(binary_img, min_area=30):
    """
    Finds bounding boxes of connected components in a binary image.
    Filters out components smaller than min_area.
    Returns:
        List of bounding boxes: [ (x, y, w, h), ... ]
    """
    # Find connected components
    num_labels, _, stats, _ = cv2.connectedComponentsWithStats(binary_img)

    bounding_boxes = []
    
    for i in range(1, num_labels):  # start from 1 to skip the background
        x, y, w, h, area = stats[i]
        
        if area >= min_area:
            bounding_boxes.append((x, y, w, h))

    return bounding_boxes

In [5]:
def extract_patches(img, bboxes, padding):
    """
    Extract patches from the original grayscale image based on bounding boxes.
    
    Parameters:
    - img: Original grayscale image (uint8).
    - bboxes: List of (x, y, w, h) bounding boxes.
    - padding: Extra pixels to pad around each box.
    
    Returns:
    - List of patches (as small grayscale images, numpy arrays).
    """

    patches = []
    #h_img, w_img = img.shape

    for (x, y, w, h) in bboxes:
        # Add padding, but clip to image size
        x1 = x - padding
        y1 = y - padding
        x2 = x + w + padding
        y2 = y + h + padding

        patch = img[y1:y2, x1:x2]
        patches.append(patch)

    return patches

In [6]:
def boxes_debug(binary, boxes, out_folder):
    img_with_boxes = cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)  # Convert to color for colored boxes
    for (x, y, w, h) in boxes:
        cv2.rectangle(img_with_boxes, (x, y), (x+w, y+h), (0, 255, 0), 1)

    cv2.imwrite(f"{out_folder}/boxes_debug.png", img_with_boxes)


In [7]:
def patches_debug(patches, out_folder, patches_per_row=20):
    num_patches = len(patches)
    rows = math.ceil(num_patches / patches_per_row)
    fig, axs = plt.subplots(rows, patches_per_row, figsize=(patches_per_row, rows))

    for i in range(rows * patches_per_row):
        row = i // patches_per_row
        col = i % patches_per_row
        
        if rows == 1:
            ax = axs[col]  # special case when there is only 1 row
        elif patches_per_row == 1:
            ax = axs[row]
        else:
            ax = axs[row, col]

        ax.axis('off')

        if i < num_patches:
            patch = patches[i]
            ax.imshow(patch, cmap='gray')
        else:
            ax.set_visible(False)  # Hide empty subplots

    plt.tight_layout()
    plt.savefig(f"{out_folder}/patches_debug.png", dpi=300)
    plt.close()

In [8]:


def hash_patches(patches, hash_size=8):
    """
    Computes the average hash (aHash) for each patch.
    
    Parameters:
    - patches: list of grayscale patches (numpy arrays).
    - hash_size: size of the hash (default 8x8).
    
    Returns:
    - List of hashes (imagehash.ImageHash objects).
    """

    hashes = []
    
    for patch in patches:
        # Convert to PIL Image
        pil_patch = Image.fromarray(patch)
        
        # Resize to (hash_size x hash_size) inside average_hash if needed
        h = imagehash.average_hash(pil_patch, hash_size=hash_size)
        
        hashes.append(h)

    return hashes

In [9]:
from collections import defaultdict

def group_similar_patches(hashes, max_distance=0):
    """
    Groups patches whose hashes are identical (distance <= max_distance).
    
    Parameters:
    - hashes: List of imagehash.ImageHash objects.
    - max_distance: Maximum Hamming distance to consider as identical (default 0 = exact match).
    
    Returns:
    - groups: List of groups, each group is a list of patch indices.
    """

    groups = []
    used = set()

    for i in range(len(hashes)):
        if i in used:
            continue
        
        group = [i]
        used.add(i)
        
        for j in range(i+1, len(hashes)):
            if j in used:
                continue
            
            if hashes[i] - hashes[j] <= max_distance:
                group.append(j)
                used.add(j)
        
        groups.append(group)

    return groups


In [15]:
def plot_patch_groups(patches, groups, out_folder, save_dir="groups_debug"):
    """
    Plots patches belonging to the same group side-by-side and saves them.
    
    Parameters:
    - patches: List of grayscale patches (numpy arrays).
    - groups: List of groups (each group is a list of indices).
    - save_dir: Directory to save the group images.
    """

    import os
    os.makedirs(f"{out_folder}/{save_dir}", exist_ok=True)

    for idx, group in enumerate(groups):
        if len(group) <= 1:
            continue  # skip groups with only 1 patch (not duplicate)
        
        num_patches = len(group)
        fig, axs = plt.subplots(1, num_patches, figsize=(num_patches*2, 2))
        
        if num_patches == 1:
            axs = [axs]

        for i, patch_idx in enumerate(group):
            ax = axs[i]
            patch = patches[patch_idx]
            ax.imshow(patch, cmap='gray')
            ax.axis('off')
        
        plt.tight_layout()
        save_path = os.path.join(f"{out_folder}/{save_dir}", f"group_{idx}.png")
        plt.savefig(save_path, dpi=600)
        plt.close()


In [11]:
def all_vs_all_template_matching(patches, group, threshold):
    """
    Performs all vs all template matching inside a patch group to find exact matches.
    """

    matches = []

    for i in range(len(group)):
        for j in range(i + 1, len(group)):
            patch_i = patches[group[i]]
            patch_j = patches[group[j]]

            # Resize both patches to the same size (e.g., size of the smaller one)
            h = min(patch_i.shape[0], patch_j.shape[0])
            w = min(patch_i.shape[1], patch_j.shape[1])

            if h < 5 or w < 5:
                # Ignore if too small after resize
                continue

            resized_i = cv2.resize(patch_i, (w, h))
            resized_j = cv2.resize(patch_j, (w, h))

            # Now do template matching
            result = cv2.matchTemplate(resized_j, resized_i, cv2.TM_SQDIFF)
            minVal, max_val, _, _ = cv2.minMaxLoc(result)
            if minVal <= threshold:
                matches.append((group[i], group[j]))
            #if max_val >= threshold:
            #    matches.append((group[i], group[j]))

    return matches


In [12]:
def apply_all_vs_all_template_matching_to_groups(patches, patch_groups, threshold=0.9):
    """
    Applies all vs all template matching to each group of patches and prints results.
    
    Parameters:
    - patches: List of grayscale patches (numpy arrays).
    - patch_groups: List of patch groups (each group is a list of indices).
    - threshold: Minimum similarity for a match (default 0.9).
    
    Returns:
    - None
    """
    all_matches = []
    
    for idx, group in enumerate(patch_groups):
        #print(f"Processing group {idx}...")
        
        matches = all_vs_all_template_matching(patches, group, threshold)
        
        if matches:
            #print(f"Matches found in group {idx}: {matches}")
            all_matches.append((idx, matches))  # Store matches along with group index
    
    return all_matches

In [13]:
def visualize_matches_on_document(img, patch_bboxes, matches, out_folder):
    """
    Visualizes matching patches on the original document image by drawing bounding boxes.
    
    Parameters:
    - img: The original document image (grayscale).
    - patch_bboxes: List of bounding boxes for each patch.
    - matches: List of tuples, each containing indices of matched patches in the group.
    - save_path: Path to save the output image with visualized matches.
    """
    
    img_copy = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

    # Draw bounding boxes for each matching pair
    for match in matches:
        idx1, idx2 = match
        
        # Get the bounding boxes for both matched patches
        x1, y1, w1, h1 = patch_bboxes[idx1]
        x2, y2, w2, h2 = patch_bboxes[idx2]

        # Draw bounding boxes on the image (in green)
        cv2.rectangle(img_copy, (x1, y1), (x1 + w1, y1 + h1), (0, 255, 0), 2)
        cv2.rectangle(img_copy, (x2, y2), (x2 + w2, y2 + h2), (0, 255, 0), 2)

        # Optionally, draw a line between the matching patches
        cv2.line(img_copy, (x1 + w1 // 2, y1 + h1 // 2), (x2 + w2 // 2, y2 + h2 // 2), (255, 0, 0), 2)

    # Save and show the image
    cv2.imwrite(f"{out_folder}/detection.png", img_copy)
    #print(f"Saved visualized matches image to {save_path}")


In [None]:
import glob
from pathlib import Path
import os

for file in glob.glob("../datasets/supatlantique/Copy-move/*.tif"):
    img_name = Path(file).stem
    out_folder = f"results/{img_name}"
    if os.path.exists(out_folder):
        continue
    Path(out_folder).mkdir(parents=True, exist_ok=True)
    img = load_image(file)
    binary = binarize_image(img)
    boxes = get_bounding_boxes(binary)
    boxes_debug(binary, boxes, out_folder)
    patches = extract_patches(img, boxes, padding=-1)
    patches_debug(patches, out_folder, patches_per_row=20)
    hashes = hash_patches(patches, hash_size=8)
    groups = group_similar_patches(hashes, max_distance=0)
    #plot_patch_groups(patches, groups, out_folder, save_dir="groups_debug")
    matches_per_group = apply_all_vs_all_template_matching_to_groups(patches, groups, threshold=0)
    all_matches = [match for _, matches in matches_per_group for match in matches]
    visualize_matches_on_document(img, boxes, all_matches, out_folder)

  ax.imshow(patch, cmap='gray')
  ax.imshow(patch, cmap='gray')


: 