In [2]:
import os
import cv2
from tqdm.notebook import tqdm
def find_duplicates_sift(dir1, dir2, threshold=10, ratio=0.75):
    """Return the images in ``dir1`` that look like duplicates of an image in ``dir2``.

    Each readable image is described by its SIFT descriptors. An image from
    ``dir1`` is reported as soon as one image from ``dir2`` shares at least
    ``threshold`` descriptor matches that pass Lowe's ratio test.

    Args:
        dir1: Directory whose images are checked; returned paths come from here.
        dir2: Directory holding the reference images.
        threshold: Minimum number of good matches for two images to be
            considered duplicates.
        ratio: Ratio-test factor. A match is kept when its distance is smaller
            than ``ratio`` times the distance of the second-best match.

    Returns:
        A list of file paths from ``dir1``, each listed at most once, in the
        order produced by ``os.listdir``.
    """
    sift = cv2.SIFT_create()
    bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=False)

    def get_image_files(directory):
        """List the paths in ``directory`` that end with a supported image extension."""
        supported_formats = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')
        return [os.path.join(directory, f) for f in os.listdir(directory)
                if f.lower().endswith(supported_formats)]

    def compute_kp_desc(image_path):
        """Return ``(keypoints, descriptors)``, or ``(None, None)`` if the image cannot be read."""
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            return None, None
        return sift.detectAndCompute(image, None)

    def count_good_matches(query_desc, train_desc):
        """Count the query descriptors whose best match passes the ratio test."""
        matches = bf.knnMatch(query_desc, train_desc, k=2)
        # knnMatch yields fewer than k entries for a query descriptor when the
        # train set holds fewer than k descriptors; without a second-best
        # match the ratio test is undefined, so such entries are skipped
        # instead of crashing on tuple unpacking.
        return sum(
            1
            for pair in matches
            if len(pair) == 2 and pair[0].distance < ratio * pair[1].distance
        )

    # Descriptors of the reference images are computed once and reused for
    # every image of dir1.
    dir2_files = get_image_files(dir2)
    dir2_descriptors = []
    for file in tqdm(dir2_files):
        _, desc = compute_kp_desc(file)
        if desc is not None:
            dir2_descriptors.append((file, desc))

    # dir1 is listed before the early exit so that an invalid dir1 path still
    # raises exactly as it would on a full run.
    dir1_files = get_image_files(dir1)
    if not dir2_descriptors:
        # Nothing to compare against: skip the SIFT pass over dir1 entirely.
        return []

    duplicates = []
    for file1 in tqdm(dir1_files):
        _, desc1 = compute_kp_desc(file1)
        if desc1 is None:
            continue

        for _file2, desc2 in dir2_descriptors:
            if count_good_matches(desc1, desc2) >= threshold:
                duplicates.append(file1)
                break  # one matching reference image is enough

    return duplicates

def delete_duplicates(duplicates):
    """Delete every file in ``duplicates`` and print a per-file and a total report.

    A failure to delete one file is reported and does not stop the remaining
    deletions.

    Args:
        duplicates: Iterable of file paths to remove.
    """
    deleted_count = 0
    for file in duplicates:
        try:
            os.remove(file)
        except Exception as e:
            print(f"Error deleting {file}: {e}")
        else:
            deleted_count += 1
            print(f"Deleted: {file}")
    # Report only the removals that actually succeeded, not the number requested.
    print(f"Total deleted images: {deleted_count}")




In [3]:

# Scan data_small/test/mask for images that match something in
# data_small/train/mask, then ask before deleting the matches.
directory1, directory2 = "data_small/test/mask", "data_small/train/mask"
similarity_threshold, ratio_test = 10, 0.6

duplicates_found = find_duplicates_sift(
    directory1,
    directory2,
    threshold=similarity_threshold,
    ratio=ratio_test,
)
print(f"Found {len(duplicates_found)} duplicate(s).")

if not duplicates_found:
    print("No duplicates found.")
else:
    # Deletion is destructive, so it only happens on an explicit "y".
    confirm = input("Do you want to delete these duplicates? (y/n): ").strip().lower()
    if confirm != 'y':
        print("Deletion aborted.")
    else:
        delete_duplicates(duplicates_found)

0it [00:00, ?it/s]

  0%|          | 0/558 [00:00<?, ?it/s]

Found 0 duplicate(s).
No duplicates found.


In [5]:

# Scan data_small/test/nomask for images that match something in
# data_small/train/nomask, then ask before deleting the matches.
directory1, directory2 = "data_small/test/nomask", "data_small/train/nomask"
similarity_threshold, ratio_test = 20, 0.6

duplicates_found = find_duplicates_sift(
    directory1,
    directory2,
    threshold=similarity_threshold,
    ratio=ratio_test,
)
print(f"Found {len(duplicates_found)} duplicate(s).")

if not duplicates_found:
    print("No duplicates found.")
else:
    # Deletion is destructive, so it only happens on an explicit "y".
    confirm = input("Do you want to delete these duplicates? (y/n): ").strip().lower()
    if confirm != 'y':
        print("Deletion aborted.")
    else:
        delete_duplicates(duplicates_found)

  0%|          | 0/3254 [00:00<?, ?it/s]

  0%|          | 0/574 [00:00<?, ?it/s]

Found 122 duplicate(s).
Deletion aborted.
