In [None]:
%matplotlib inline

import os
import cv2
import numpy as np
from PIL import Image
from collections import defaultdict

def compute_dhash(image, hash_size=8):
    """Compute a difference hash (dHash) of an image.

    The image is shrunk to ``hash_size + 1`` columns by ``hash_size`` rows, and
    every pixel is compared with its left-hand neighbour.

    Args:
        image: Pixel array accepted by ``cv2.resize``. Callers in this notebook
            pass a 2-D grayscale array; other layouts are not handled specially.
        hash_size: Number of rows (and of horizontal comparisons per row) in
            the hash grid.

    Returns:
        A flat ``uint8`` array holding 1 where a pixel is brighter than the
        pixel to its left and 0 otherwise.
    """
    # cv2.resize takes the target size as (width, height).
    target_size = (hash_size + 1, hash_size)
    shrunk = cv2.resize(image, target_size, interpolation=cv2.INTER_LINEAR)

    left_pixels = shrunk[:, :-1]
    right_pixels = shrunk[:, 1:]
    brighter_than_left = right_pixels > left_pixels

    return brighter_than_left.ravel().astype(np.uint8)

def hamming_distance(hash1, hash2):
    """Return the normalized Hamming distance between two hashes.

    The result is the fraction of positions at which the two hashes differ:
    0.0 means identical, 1.0 means every position differs.

    Args:
        hash1: First hash, as a numpy array or any array-like of bits.
        hash2: Second hash, with the same shape as ``hash1``.

    Returns:
        The fraction of differing positions as a numpy float.

    Raises:
        ValueError: If the two hashes do not have the same shape.
    """
    # Coerce first: `!=` on two plain lists yields a single bool, not an
    # element-wise comparison, which would collapse the distance to 0.0 or 1.0.
    bits1 = np.asarray(hash1)
    bits2 = np.asarray(hash2)

    # Refuse silent broadcasting between hashes of different sizes.
    if bits1.shape != bits2.shape:
        raise ValueError(
            f"hash shapes differ: {bits1.shape} vs {bits2.shape}"
        )

    return np.mean(bits1 != bits2)

        
class Duplicate:
    """Collect difference hashes of images and report near-duplicate pairs.

    Hashes are stored in ``image_hashes`` (file path -> flat bit array) in the
    order in which the images were hashed. That order matters because the
    ``start_idx`` argument of ``find_near_duplicates`` is an index into it.
    """

    def __init__(self, hash_size=8):
        """Create an empty hash index.

        Args:
            hash_size: Size forwarded to ``compute_dhash`` for every image.
        """
        self.image_hashes = {}
        self.hash_size = hash_size

    def hash_image_folder(self, image_folder):
        """Hash every readable image located directly inside ``image_folder``.

        Each entry is opened as a grayscale image, hashed with
        ``compute_dhash`` and stored under its full path. Entries that cannot
        be processed are reported on stdout and skipped.

        Args:
            image_folder: Directory whose entries are hashed (not recursive).

        Returns:
            The number of images hashed by this call. Summing the counts of
            earlier calls gives the ``start_idx`` of a later folder.
        """
        n_image = 0

        # os.listdir returns entries in arbitrary order; sorting makes the
        # insertion order (and therefore start_idx offsets) reproducible.
        for filename in sorted(os.listdir(image_folder)):
            file_path = os.path.join(image_folder, filename)

            try:
                # Load as grayscale; the context manager releases the file
                # handle even when decoding fails part-way.
                with Image.open(file_path) as img:
                    image = np.array(img.convert('L'))
                img_hash = compute_dhash(image, self.hash_size)
                self.image_hashes[file_path] = img_hash
                n_image += 1
            except Exception as e:
                # Best effort: report the unreadable entry and carry on.
                print(f"Error processing {file_path}: {e}")
                continue
        print(f'images hashed: {n_image}')
        return n_image

    def find_near_duplicates(self, threshold=0.1, start_idx=0):
        """Find pairs of stored images whose hashes are nearly identical.

        Image ``i`` (in hashing order) is compared with every later image
        whose index is at least ``start_idx``. The distance is the fraction of
        differing hash bits, the same quantity ``hamming_distance`` computes.

        All stored hashes must have the same length, which holds when they
        were all produced with this instance's ``hash_size``.

        Args:
            threshold: Largest distance (inclusive) that counts as a match.
            start_idx: Index of the first image allowed as the second member
                of a pair. Pass the number of previously hashed images to
                report only matches against images hashed after them.

        Returns:
            A ``defaultdict(list)`` mapping a file path to the paths of its
            near-duplicates. Only images with at least one match appear.
        """
        duplicates = defaultdict(list)

        file_list = list(self.image_hashes.keys())
        n_files = len(file_list)
        if n_files < 2:
            return duplicates

        # One row per image so that an image is compared with all of its
        # candidates in a single numpy operation instead of a Python loop.
        hash_matrix = np.stack([self.image_hashes[path] for path in file_list])

        for i, file1 in enumerate(file_list):
            first = max(i + 1, start_idx)
            if first >= n_files:
                continue
            distances = np.mean(hash_matrix[first:] != hash_matrix[i], axis=1)
            for offset in np.flatnonzero(distances <= threshold):
                duplicates[file1].append(file_list[first + offset])

        return duplicates

In [None]:
# Shared hash index for the whole notebook; 8 is the hash_size used for every image.
dup = Duplicate(8)

In [None]:
# Hash the test split; hashes accumulate in `dup.image_hashes`, keyed by file path.
image_folder = "/media/charles/DATA/Programs/datasetManipulation/datasets/Dataset-ViPARE-22/test/images"
dup.hash_image_folder(image_folder)

In [None]:
# Hash the validation split into the same `dup` index.
image_folder = "/media/charles/DATA/Programs/datasetManipulation/datasets/Dataset-ViPARE-22/valid/images"
dup.hash_image_folder(image_folder)

In [None]:
# Hash the training split into the same `dup` index.
image_folder = "/media/charles/DATA/Programs/datasetManipulation/datasets/Dataset-ViPARE-22/train/images"
dup.hash_image_folder(image_folder)

In [None]:
# Hash the new candidate images last.
# NOTE(review): the start_idx passed to find_near_duplicates below presumably relies on these
# entries coming after all dataset images in `dup.image_hashes` — confirm before reordering cells.
image_folder = "/media/charles/DATA/Programs/datasetManipulation/datasets/newImages"
dup.hash_image_folder(image_folder)

In [None]:
# start_idx = 345+790+3802 is presumably the number of test/valid/train images hashed
# above, so that only pairs ending in a new image are reported — TODO confirm the counts.
duplicates = dup.find_near_duplicates(threshold=0.1, start_idx=345 + 790 + 3802)

# List every image that has at least one near-duplicate, followed by its matches.
for image in duplicates:
    print(f"Image: {image}")
    near_duplicates = duplicates[image]
    for duplicate in near_duplicates:
        print(f"  -> Near-duplicate: {duplicate}")

# Number of images with matches, then (as the cell output) the total number of matched pairs.
print(len(duplicates))
sum(len(matches) for matches in duplicates.values())

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

import os

def display_duplicates(duplicates):
    """Show each image next to its near-duplicates and print where they come from.

    One single-row figure is drawn per entry: the original image in the first
    column, followed by one column per near-duplicate. For every image shown,
    a line with its dataset split and file name is printed.

    Args:
        duplicates: Mapping of an image path to a list of near-duplicate image
            paths. An empty list is allowed and shows the original alone.
    """
    def extract_dataset(path):
        """Return "train", "valid" or "test" if a directory of ``path`` is named so, else None."""
        # Match whole directory names only, so that a file such as
        # ".../test/images/train_001.jpg" is not reported as "train".
        directories = os.path.dirname(path).replace("\\", "/").split("/")
        for split in ("train", "valid", "test"):
            if split in directories:
                return split
        return None

    for image_path, near_duplicates in duplicates.items():
        num_duplicates = len(near_duplicates)

        # squeeze=False always yields a 2-D axes grid, so indexing also works
        # when there is a single column (no near-duplicates).
        fig, axes_grid = plt.subplots(
            1,
            num_duplicates + 1,
            figsize=(5 * (num_duplicates + 1), 5),
            squeeze=False,
        )
        axes = axes_grid[0]

        try:
            print(f"Original: {extract_dataset(image_path)} - {os.path.basename(image_path)}")

            # imshow copies the pixel data, so the file can be closed right after.
            with Image.open(image_path) as original_image:
                axes[0].imshow(original_image)
            axes[0].axis("off")

            for i, duplicate_path in enumerate(near_duplicates):
                with Image.open(duplicate_path) as duplicate_image:
                    axes[i + 1].imshow(duplicate_image)
                axes[i + 1].axis("off")
                print(f"Duplicate: {extract_dataset(duplicate_path)} - {os.path.basename(duplicate_path)}")

            plt.tight_layout()
            plt.show()
        finally:
            # Release the figure so memory does not grow with the number of entries.
            plt.close(fig)


In [None]:
# Preview only the first 50 duplicate groups to keep the output manageable
# (note: despite its name, `dup_10` holds up to 50 entries).
dup_10 = dict(list(duplicates.items())[0:50])
display_duplicates(dup_10)