In [8]:
import os
from torchvision.datasets import ImageFolder
from torchvision.transforms import ToTensor
from collections import Counter

dataset_dir = "./datasets/CARDIS-clean"

In [9]:
cardis = ImageFolder(root=dataset_dir, transform=ToTensor())

Dataset ImageFolder
    Number of datapoints: 48159
    Root location: ./datasets/CARDIS-clean
    StandardTransform
Transform: ToTensor()

In [7]:
for file_path, _ in cardis.samples:  
    if "copy" in os.path.basename(file_path).lower():
        try:
            os.remove(file_path)  # Delete the file
            print(f"Deleted: {file_path}")
        except Exception as e:
            print(f"Failed to delete {file_path}: {e}")

print("Processing completed.")

Deleted: ./datasets/CARDIS/b/1 - Copy (10) - Copy.jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (2) - Copy - Copy - Copy.jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (2) - Copy - Copy.jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (2) - Copy.jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (2).jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (3) - Copy - Copy - Copy.jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (3) - Copy - Copy.jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (3) - Copy.jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (3).jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (4) - Copy - Copy.jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (4) - Copy.jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (4).jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (5) - Copy - Copy.jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (5) - Copy.jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (5).jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (6) - Copy - Copy.jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (6) - Copy.jpg
Deleted: ./datasets/CARDIS/b/1 - Copy (6).jpg
Deleted: ./datase

In [8]:
cardis = ImageFolder(root=dataset_dir, transform=ToTensor())

FileNotFoundError: Found no valid file for the classes v, z. Supported extensions are: .jpg, .jpeg, .png, .ppm, .bmp, .pgm, .tif, .tiff, .webp

In [4]:
from PIL import Image
import imagehash
import os
import pandas as pd

def compare_images_in_class(class_folder):
    """
    Compare all images in a given class folder to find duplicates and unique images.

    Args:
        class_folder (str): Path to the folder containing images of a single class.

    Returns:
        tuple: (number_of_duplicates, number_of_unique_images)
    """
    hashes = {}
    duplicates = 0

    for file in os.listdir(class_folder):
        file_path = os.path.join(class_folder, file)
        if os.path.isfile(file_path):
            with Image.open(file_path) as img:
                hash_value = imagehash.average_hash(img)
                if hash_value in hashes:
                    duplicates += 1
                else:
                    hashes[hash_value] = file_path

    unique_images = len(hashes)
    return duplicates, unique_images


def analyze_classes(dataset_folder):
    """
    Analyze all classes in the dataset and return the number of duplicates and unique images per class.

    Args:
        dataset_folder (str): Path to the dataset folder.

    Returns:
        dict: A dictionary with class names as keys and (duplicates, unique_images) as values.
    """
    results = {}
    for class_name in os.listdir(dataset_folder):
        class_folder = os.path.join(dataset_folder, class_name)
        if os.path.isdir(class_folder):
            duplicates, unique_images = compare_images_in_class(class_folder)
            results[class_name] = (duplicates, unique_images)

    return results

In [11]:
dataset_folder = "./datasets/DIDA-clean"
class_analysis = analyze_classes(dataset_folder)


df_results = pd.DataFrame.from_dict(class_analysis, orient="index", columns=["Duplicates", "Unique"])
df_results.index.name = "Labels"

html_output_file = "CARDIS.html"
df_results.to_html(html_output_file, index=True)

latex_output_file = "CARDIS.tex"
df_results.to_latex(latex_output_file, index=True, caption="CARDIS", label="tab:cardis-duplicate-class-analysis")

KeyboardInterrupt: 

In [22]:
# Example usage
dataset_folder = "./datasets/DIDA"
class_analysis = analyze_classes(dataset_folder)

# Convert the results to a DataFrame
df_results = pd.DataFrame.from_dict(class_analysis, orient="index", columns=["Duplicates", "Unique Images"])
df_results.index.name = "Labels"

html_output_file = "DIDA.html"
df_results.to_html(html_output_file, index=True)


In [1]:
from PIL import Image
import imagehash
import os

def compare_and_delete_duplicates(class_folder):
    """
    Compare all images in a given class folder to find and delete duplicates.

    Args:
        class_folder (str): Path to the folder containing images of a single class.

    Returns:
        tuple: (number_of_duplicates_deleted, number_of_unique_images)
    """
    hashes = {}
    duplicates = 0

    for file in os.listdir(class_folder):
        file_path = os.path.join(class_folder, file)
        if os.path.isfile(file_path):
            with Image.open(file_path) as img:
                hash_value = imagehash.average_hash(img)
                if hash_value in hashes:
                    # Duplicate found, delete the file
                    os.remove(file_path)
                    duplicates += 1
                else:
                    hashes[hash_value] = file_path

    unique_images = len(hashes)
    return duplicates, unique_images


def analyze_and_cleanup(dataset_folder):
    """
    Analyze all classes in the dataset, delete duplicates, and return the number of duplicates
    and unique images per class.

    Args:
        dataset_folder (str): Path to the dataset folder.

    Returns:
        dict: A dictionary with class names as keys and (duplicates_deleted, unique_images) as values.
    """
    results = {}
    for class_name in os.listdir(dataset_folder):
        class_folder = os.path.join(dataset_folder, class_name)
        if os.path.isdir(class_folder):
            duplicates, unique_images = compare_and_delete_duplicates(class_folder)
            results[class_name] = (duplicates, unique_images)

    return results


In [12]:
dataset_folder = "./datasets/DIDA-clean"
cleanup_results = analyze_and_cleanup(dataset_folder)

for class_name, (duplicates_deleted, unique_images) in cleanup_results.items():
    print(f"Class: {class_name} | Duplicates Deleted: {duplicates_deleted} | Unique Images: {unique_images}")


Class: 5 | Duplicates Deleted: 124 | Unique Images: 25565
Class: 6 | Duplicates Deleted: 101 | Unique Images: 25605
Class: 0 | Duplicates Deleted: 289 | Unique Images: 24751
Class: 2 | Duplicates Deleted: 15 | Unique Images: 25137
Class: 8 | Duplicates Deleted: 197 | Unique Images: 25167
Class: 1 | Duplicates Deleted: 1295 | Unique Images: 23707
Class: 9 | Duplicates Deleted: 137 | Unique Images: 24970
Class: 4 | Duplicates Deleted: 90 | Unique Images: 25318
Class: 3 | Duplicates Deleted: 31 | Unique Images: 25018
Class: 7 | Duplicates Deleted: 83 | Unique Images: 25260


In [13]:
import shutil

def remove_small_classes(dataset_folder, min_elements=30):
    """
    Remove classes (folders) that contain fewer than the specified number of elements.

    Args:
        dataset_folder (str): Path to the dataset folder.
        min_elements (int): Minimum number of elements required to keep a class.

    Returns:
        list: List of class names that were deleted.
    """
    deleted_classes = []

    for class_name in os.listdir(dataset_folder):
        class_folder = os.path.join(dataset_folder, class_name)
        if os.path.isdir(class_folder):
            # Count the number of files in the class folder
            num_files = len([f for f in os.listdir(class_folder) if os.path.isfile(os.path.join(class_folder, f))])
            
            # Check if the folder has fewer than the minimum required elements
            if num_files < min_elements:
                # Delete the folder and log its name
                shutil.rmtree(class_folder)
                deleted_classes.append(class_name)

    return deleted_classes

# Example Usage
dataset_folder = "./datasets/CARDIS-class-removed"
deleted_classes = remove_small_classes(dataset_folder, min_elements=30)

print(f"Deleted classes with fewer than 30 elements: {deleted_classes}")

Deleted classes with fewer than 30 elements: ['x', 'z', 'r', 'w', 'v', 'y']
