<a href="https://colab.research.google.com/github/marcory-hub/datasettools/blob/main/findDuplicates.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Delete duplicate images

Last accessed: 2025-12-15

## 1. Zip your images and transfer them from your local computer to Colab

1. On your local computer make a folder `images`.
2. Put all images from one insect type in the folder `images`.
3. Compress the folder to `images.zip`. On macOS use `zip -r images.zip images -i '*.jpg' '*.txt' '*/'` to exclude macOS metadata from the zipped file.
4. Put `images.zip` in the top level of your Google Drive (the code below reads it from `MyDrive/images.zip`).
5. Continue with the next code blocks:

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

# Unzip file with images
!mkdir '/content/images'
!scp '/content/gdrive/MyDrive/images.zip' '/content/images.zip'
!unzip '/content/images.zip'

# Optional: check that the path is correct
!ls '/content/images/'

In [None]:
!pip install ImageHash
!pip install Pillow

## 2. Comparison of the images

- Default threshold: 0.80
- Check for real duplicates, and adjust the threshold in the next coding block to remove true duplicates and look-alikes.



In [None]:
# Precomputes perceptual hashes (pHash) for all images; pHash uses the Discrete Cosine Transform (DCT) to focus on the low-frequency components of the image.
# Finds similar image pairs.
# Sorts and displays pairs with similarity scores above the threshold (0.80).
# Saves results to the output_file.


import os
import time
from concurrent.futures import ProcessPoolExecutor  # For parallel processing
import numpy as np
from PIL import Image
import imagehash  # For perceptual hashing of images
from tqdm import tqdm  # For progress visualization
import cv2
from google.colab.patches import cv2_imshow  # For displaying images in Google Colab

# Define the image folder and the report file used by this cell.
# NOTE: despite its name, output_folder is only read from in this cell
# (listed in main() and joined with file names before hashing/displaying).
output_folder = '/content/images'  # Directory where images are stored
# Relative path: the report is created in the current working directory.
output_file = 'similar_images.txt'  # File to store results of similar image pairs

# Function to compute perceptual hashes for an image
def compute_image_hash(image_path, hash_sizes=(8, 16, 32)):
    """Compute perceptual hashes (pHash) of one image at several hash sizes.

    Args:
        image_path: Path of the image file to hash.
        hash_sizes: Iterable of pHash sizes. One hash is produced per size;
            the default is an immutable tuple so it cannot be modified by
            accident between calls.

    Returns:
        A list with one hexadecimal hash string per entry of ``hash_sizes``,
        or ``None`` when the file could not be opened or hashed.
    """
    try:
        # Open image file and convert to grayscale
        with Image.open(image_path) as img:
            gray = img.convert("L")
            # Each hash is computed on a square of 4 * size pixels per side.
            return [
                str(imagehash.phash(gray.resize((size * 4, size * 4), Image.LANCZOS), hash_size=size))
                for size in hash_sizes
            ]
    except Exception as exc:
        # Best effort: a file that cannot be hashed is reported and skipped
        # instead of aborting the whole run.
        print(f"Error processing image {image_path}: {exc}")
        return None

# Function to precompute hashes for a list of image file names
def precompute_hashes(image_files):
    """Hash every file name in ``image_files`` (relative to ``output_folder``).

    Returns a dict mapping file name -> result of ``compute_image_hash``.
    Files for which no (or an empty) hash result came back are left out.
    """
    hashed = (
        (name, compute_image_hash(os.path.join(output_folder, name)))
        for name in image_files
    )
    # Keep only the entries that produced a usable hash list
    return {name: digest for name, digest in hashed if digest}

# Function to compare hashes in a given chunk of data
def compare_hashes_chunk(chunk):
    similar_pairs = []
    # Iterate through pairs of images in the chunk
    for i in range(len(chunk)):
        for j in range(i + 1, len(chunk)):
            img1, hashes1 = chunk[i]
            img2, hashes2 = chunk[j]
            if hashes1 is not None and hashes2 is not None:
                # Calculate similarity for each hash size
                similarities = [1 - (int(h1, 16) ^ int(h2, 16)).bit_count() / (len(h1) * 4)
                                for h1, h2 in zip(hashes1, hashes2)]
                max_similarity = max(similarities)
                # Check if similarity exceeds threshold
                if max_similarity > 0.80:  # Threshold for similarity
                    similar_pairs.append((max_similarity, (img1, img2)))
    return similar_pairs

# Worker: compare a strided subset of images against every later image
def _compare_rows(task):
    """Compare selected images with all images that follow them.

    Args:
        task: ``(hash_items, offset, step)``. The images at indices
            ``offset, offset + step, offset + 2 * step, ...`` are each
            compared with every image at a higher index.

    Returns:
        The similar pairs found, in the format of ``compare_hashes_chunk``.
    """
    hash_items, offset, step = task
    total = len(hash_items)
    found = []
    for i in range(offset, total, step):
        for j in range(i + 1, total):
            # Delegate to compare_hashes_chunk with a two-item chunk so the
            # similarity formula and threshold stay defined in one place.
            found.extend(compare_hashes_chunk([hash_items[i], hash_items[j]]))
    return found


# Function to find all similar image pairs based on precomputed hashes
def find_similar_pairs(hashes):
    """Compare every image with every other image and return similar pairs.

    The previous implementation split the images into chunks and compared
    only within a chunk, so two similar images that landed in different
    chunks were never reported. Here the work is split by rows of the pair
    matrix instead, so every pair is compared exactly once.

    Args:
        hashes: Dict mapping image name -> list of hash strings.

    Returns:
        A list of ``(similarity, (image_name_1, image_name_2))`` tuples.
    """
    hash_items = list(hashes.items())
    if len(hash_items) < 2:
        # No pairs to compare
        return []

    # os.cpu_count() may return None; never start more workers than rows.
    worker_count = min(os.cpu_count() or 1, len(hash_items) - 1)
    # Striding (offset, offset + step, ...) balances the load: early rows
    # have many partners, late rows only a few.
    tasks = [(hash_items, offset, worker_count) for offset in range(worker_count)]

    # Use ProcessPoolExecutor to parallelize hash comparisons
    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        results = list(tqdm(executor.map(_compare_rows, tasks), total=len(tasks), desc="Comparing hashes"))

    # Flatten results from all workers
    return [item for sublist in results for item in sublist]

# Function to show a small thumbnail of an image with its file name underneath
def display_image_with_filename(image_path):
    """Display a 64x64 thumbnail of ``image_path`` with a caption strip.

    The caption strip (20 px high) carries the file name. Any error while
    loading or displaying the image is reported with a printed message.
    """
    thumb_size = (64, 64)
    try:
        # Load the image and shrink it to thumbnail size
        thumbnail = cv2.resize(cv2.imread(image_path), thumb_size)
        label = os.path.basename(image_path)

        # Black strip with the file name drawn in white
        caption = np.zeros((20, 64, 3), dtype=np.uint8)
        cv2.putText(caption, label, (0, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (255, 255, 255), 1)

        # Stack thumbnail above caption and show the result in Colab
        cv2_imshow(np.vstack((thumbnail, caption)))
        print(f"Filename: {label}")
    except Exception as e:
        # Report the problem and carry on with the next image
        print(f"Error displaying image {image_path}: {e}")

# Function to process a list of images and find similar pairs
def process_images(image_list):
    """Hash the given images, then display and save every similar pair.

    Args:
        image_list: Image file names, relative to ``output_folder``.

    Side effects:
        Prints progress and results, displays both images of every similar
        pair, and writes the list of pairs to ``output_file``.
    """
    start_time = time.time()  # Record start time

    # Precompute hashes for all images
    print("Precomputing hashes...")
    # os.cpu_count() may return None; fall back to a single chunk then.
    worker_count = os.cpu_count() or 1
    chunk_size = max(1, len(image_list) // worker_count)
    chunks = [image_list[i:i + chunk_size] for i in range(0, len(image_list), chunk_size)]
    with ProcessPoolExecutor() as executor:
        results = list(tqdm(executor.map(precompute_hashes, chunks), total=len(chunks), desc="Computing hashes"))

    # Combine the per-chunk results into a single dictionary
    image_hashes = {}
    for partial in results:
        image_hashes.update(partial)

    # Find pairs of similar images
    print("Finding similar pairs...")
    similar_pairs = find_similar_pairs(image_hashes)

    # Sort similar pairs by similarity score, most similar first
    similar_pairs.sort(reverse=True, key=lambda x: x[0])

    # Display and save similar pairs. The message names 0.80 because that is
    # the threshold the comparison actually applies (it used to say 0.85).
    print("\nSimilar pairs above 0.80 threshold:")
    # Explicit encoding so file names with non-ASCII characters are written
    # the same way regardless of the platform default.
    with open(output_file, 'w', encoding='utf-8') as f:
        for i, (similarity, (img1_name, img2_name)) in enumerate(similar_pairs):
            print(f"\nPair {i+1} with similarity: {similarity:.4f}")
            f.write(f"Pair {i+1} with similarity: {similarity:.4f}\n")

            img1_path = os.path.join(output_folder, img1_name)
            img2_path = os.path.join(output_folder, img2_name)

            print("Image 1:")
            print(f"Filename: {img1_name}")
            f.write(f"Image 1: {img1_name}\n")
            display_image_with_filename(img1_path)

            print("\nImage 2:")
            print(f"Filename: {img2_name}")
            f.write(f"Image 2: {img2_name}\n\n")
            display_image_with_filename(img2_path)

    # Display summary and execution time
    print(f"\nTotal similar pairs found: {len(similar_pairs)}")
    print(f"Results have been saved to {output_file}")
    end_time = time.time()
    print(f"Comparison complete! Time taken: {end_time - start_time:.2f} seconds.")

# Entry point: run the similarity search over everything in the image folder
def main():
    """List ``output_folder`` and hand all entries to ``process_images``."""
    print("Processing all images in the folder...")
    # Every directory entry is passed on; nothing is filtered here
    process_images(os.listdir(output_folder))

# Start processing only when this code runs as the main program
if __name__ == "__main__":
    main()

## 3. Remove duplicate images from folder images

Find the lowest similarity at which the image pairs shown above are real duplicates, and set `similarity_threshold` in the code below to that value. Of each duplicate pair, the image with the longer file name is removed.

In [None]:
import os
import imagehash
from PIL import Image

# Folder that is scanned for duplicates; matching files are deleted from it
# in place (change the path as needed)
output_folder = '/content/images'

def compute_image_hash(image_path, hash_size=8):
    """Compute the perceptual hash (pHash) of one image.

    The image is converted to grayscale and scaled to ``hash_size * 4``
    pixels per side (32x32 by default) before hashing. Earlier this function
    shrank the image to 8x8 first; pHash derives its hash from a
    ``hash_size * 4`` square, so that step discarded most of the detail the
    hash is built from.

    Args:
        image_path: Path of the image file to hash.
        hash_size: pHash size; the hash has ``hash_size ** 2`` bits.

    Returns:
        The ``imagehash`` hash object, or ``None`` when the file could not be
        opened or hashed (the error is printed).
    """
    try:
        with Image.open(image_path) as img:
            side = hash_size * 4
            gray = img.convert("L").resize((side, side), Image.LANCZOS)
            return imagehash.phash(gray, hash_size=hash_size)  # Return the hash object directly
    except Exception as e:
        # Best effort: report the file and let the caller skip it
        print(f"Error processing image {image_path}: {e}")
        return None

def find_and_remove_duplicates(folder, similarity_threshold=0.8100):
    """Delete near-duplicate images from ``folder``.

    Every visible regular file directly inside ``folder`` is hashed once with
    ``compute_image_hash``. All pairs are then compared; when the similarity
    of a pair is at least ``similarity_threshold``, one of the two files is
    deleted: the one with the longer file name, or, for names of equal
    length, the one whose path sorts last.

    Args:
        folder: Directory to clean up. Hidden entries (names starting with
            ``.``) and sub-directories are ignored.
        similarity_threshold: Minimum similarity (0..1) at which two images
            count as duplicates.

    Side effects:
        Deletes files from ``folder`` and prints one line per deleted file
        plus a final count. A file that cannot be deleted is reported and
        skipped instead of aborting the run.
    """
    # Hash every candidate once, in directory-listing order. Files that
    # cannot be hashed are never compared and therefore never removed.
    hashed_files = []
    for filename in os.listdir(folder):
        if filename.startswith('.'):  # Skip hidden files
            continue
        file_path = os.path.join(folder, filename)
        if not os.path.isfile(file_path):
            continue
        hash_obj = compute_image_hash(file_path)
        if hash_obj is not None:
            hashed_files.append((file_path, hash_obj))

    # Paths that take no further part in comparisons (deleted, or deletion failed)
    retired = set()
    total_removed = 0

    # Iterate through all unique pairs of images
    for index, (path1, hash_obj1) in enumerate(hashed_files):
        if path1 in retired:
            continue
        for path2, hash_obj2 in hashed_files[index + 1:]:
            if path2 in retired:
                continue

            # Hamming distance relative to the number of hash bits
            # (for an 8x8 pHash, hash.size is 64)
            similarity = 1 - (hash_obj1 - hash_obj2) / hash_obj1.hash.size
            if similarity < similarity_threshold:
                continue

            # Remove the file with the longer name; on equal length the
            # lexicographically larger path.
            fname1 = os.path.basename(path1)
            fname2 = os.path.basename(path2)
            if len(fname1) != len(fname2):
                to_remove_path = path1 if len(fname1) > len(fname2) else path2
            else:
                to_remove_path = max(path1, path2)

            print(f"Removing duplicate image: {os.path.basename(to_remove_path)}")
            try:
                os.remove(to_remove_path)
            except OSError as exc:
                print(f"Could not remove {to_remove_path}: {exc}")
            else:
                total_removed += 1
            retired.add(to_remove_path)

            if to_remove_path == path1:
                # path1 is gone; none of its remaining pairs matter.
                break

    print(f"\nTotal number of duplicate images removed: {total_removed}")

if __name__ == "__main__":
    # Call the function with the desired similarity threshold
    find_and_remove_duplicates(output_folder, similarity_threshold=0.8100)


## Compile the file
Download `images_noduplicates.zip` before closing colab!

In [None]:
import os
import zipfile

# Define the folder containing images and the name of the zip file
output_folder = '/content/images'  # Files directly inside this folder are archived
zip_file_name = '/content/images_noduplicates.zip'  # Specify the path for the zip file

def zip_images(folder, zip_file):
    """Write every regular file directly inside ``folder`` into ``zip_file``.

    Files are stored under their bare file name (no directory prefix);
    sub-directories are skipped. Prints the archive path when done.
    """
    archive = zipfile.ZipFile(zip_file, 'w')
    with archive:
        entries = ((name, os.path.join(folder, name)) for name in os.listdir(folder))
        for name, path in entries:
            if not os.path.isfile(path):
                continue  # Only plain files go into the archive
            archive.write(path, arcname=name)
    print(f"Created zip file: {zip_file}")

# Call the function to zip images: archives the files in output_folder
# into zip_file_name
zip_images(output_folder, zip_file_name)