In [1]:
import os
import numpy as np
import torch
from PIL import Image
from scipy.spatial.distance import cosine
from transformers import ViTFeatureExtractor, ViTModel



In [2]:
# Load pre-trained ViT model and feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [3]:
def get_image_embedding(image_path):
    """
    Generate an embedding for an image using ViT.

    Args:
        image_path (str): Path to the image.

    Returns:
        np.ndarray: A 1D array representing the image embedding.
    """
    # Load and preprocess the image
    image = Image.open(image_path).convert("RGB")
    inputs = feature_extractor(images=image, return_tensors="pt")

    # Generate embedding using ViT
    with torch.no_grad():
        outputs = vit_model(**inputs)
        embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token embedding

    return embedding.squeeze().numpy()

def check_duplicates_in_category(folder_path, threshold=0.1):
    """
    Check for duplicate images in a folder based on embeddings.

    Args:
        folder_path (str): Path to the folder containing images of a single category.
        threshold (float): Cosine similarity threshold for duplicates (default: 0.1).

    Returns:
        dict: Mapping of duplicate images and their similar counterparts.
    """
    embeddings = {}
    duplicates = {}

    for filename in os.listdir(folder_path):
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(folder_path, filename)
        embedding = get_image_embedding(image_path)

        # Compare with existing embeddings
        for existing_filename, existing_embedding in embeddings.items():
            similarity = 1 - cosine(embedding, existing_embedding)
            if similarity > (1 - threshold):  # Check if similarity exceeds the threshold
                duplicates[filename] = existing_filename
                break
        else:
            # Add the new image to embeddings if no duplicate is found
            embeddings[filename] = embedding

    return duplicates

def check_duplicates_in_all_categories(base_folder, threshold=0.1):
    """
    Check for duplicate images in each category folder under a base directory.

    Args:
        base_folder (str): Path to the base folder containing category folders.
        threshold (float): Cosine similarity threshold for duplicates (default: 0.1).

    Returns:
        dict: Mapping of categories to their duplicate mappings.
    """
    all_duplicates = {}

    for category in os.listdir(base_folder):
        category_path = os.path.join(base_folder, category)
        if not os.path.isdir(category_path):
            continue

        print(f"Checking duplicates in category: {category}")
        duplicates = check_duplicates_in_category(category_path, threshold)
        all_duplicates[category] = duplicates

    return all_duplicates

In [4]:
base_folder = "/kaggle/input/18012025-vqa/images"  
threshold = 0.1       
duplicates_by_category = check_duplicates_in_all_categories(base_folder, threshold)

# Display duplicates
for category, duplicates in duplicates_by_category.items():
    if duplicates:
        print(f"Duplicates in category '{category}':")
        for duplicate, original in duplicates.items():
            print(f"  {duplicate} is a duplicate of {original}")
    else:
        print(f"No duplicates found in category '{category}'.")


Checking duplicates in category: motorcycle
Checking duplicates in category: airplane
Checking duplicates in category: horse




Checking duplicates in category: bus
Checking duplicates in category: book
Checking duplicates in category: clock
Checking duplicates in category: truck
Checking duplicates in category: wine glass
Checking duplicates in category: dog
Checking duplicates in category: bear
Checking duplicates in category: spoon
Checking duplicates in category: cup
Checking duplicates in category: fork
Checking duplicates in category: bird
Checking duplicates in category: car
Checking duplicates in category: boat
Checking duplicates in category: elephant
Checking duplicates in category: sheep
Checking duplicates in category: zebra
Checking duplicates in category: teddy bear
Checking duplicates in category: cow
Checking duplicates in category: bicycle
Checking duplicates in category: giraffe
Checking duplicates in category: scissors
Checking duplicates in category: toothbrush
Checking duplicates in category: vase
Checking duplicates in category: hair drier
Checking duplicates in category: bowl
Checking dup