In [1]:
import os
import tempfile
import shutil
import numpy as np
import torch
from PIL import Image
from scipy.spatial.distance import cosine
from transformers import ViTFeatureExtractor, ViTModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load pre-trained ViT model and feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')



In [3]:
def get_image_embedding(image_path):
    """
    Generate an embedding for an image using ViT.
    Args:
        image_path (str): Path to the image.
    Returns:
        np.ndarray: A 1D array representing the image embedding.
    """
    try:
        # Load and preprocess the image
        image = Image.open(image_path).convert("RGB")
        inputs = feature_extractor(images=image, return_tensors="pt")
        
        # Generate embedding using ViT
        with torch.no_grad():
            outputs = vit_model(**inputs)
            embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token embedding
        return embedding.squeeze().numpy()
    except Exception as e:
        print(f"Error generating embedding for {image_path}: {str(e)}")
        return None

def check_duplicates_across_categories(dataset_folders, threshold=0.1):
    """
    Check for duplicate images across multiple datasets with the same folder structure.
    Args:
        dataset_folders (list of str): Paths to the dataset base folders.
        threshold (float): Cosine similarity threshold for duplicates (default: 0.1).
    Returns:
        dict: Mapping of category to duplicates across all datasets.
    """
    all_duplicates = {}
    embeddings_by_category = {}
    
    # Validate input folders
    for folder in dataset_folders:
        if not os.path.exists(folder):
            raise ValueError(f"Dataset folder does not exist: {folder}")
    
    # Collect embeddings for all datasets
    for base_folder in dataset_folders:
        print(f"Processing folder: {base_folder}")
        
        for category in os.listdir(base_folder):
            category_path = os.path.join(base_folder, category)
            if not os.path.isdir(category_path):
                continue
                
            print(f"Processing category: {category}")
            
            if category not in embeddings_by_category:
                embeddings_by_category[category] = {}
            
            # Get list of image files
            image_files = [f for f in os.listdir(category_path) 
                         if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))]
            
            # Process each image
            for filename in image_files:
                image_path = os.path.join(category_path, filename)
                embedding = get_image_embedding(image_path)
                
                if embedding is not None:
                    key = (base_folder, filename)
                    embeddings_by_category[category][key] = embedding
    
    # Check for duplicates across datasets
    for category, embeddings in embeddings_by_category.items():
        print(f"\nChecking duplicates in category: {category}")
        duplicates = {}
        embedding_list = list(embeddings.items())
        total_comparisons = len(embedding_list) * (len(embedding_list) - 1) // 2
        
        if total_comparisons == 0:
            continue
            
        comparison_count = 0
        
        for i, (key1, embedding1) in enumerate(embedding_list):
            for j, (key2, embedding2) in enumerate(embedding_list[i + 1:], i + 1):
                try:
                    similarity = 1 - cosine(embedding1, embedding2)
                    
                    if similarity > (1 - threshold):
                        if key1 not in duplicates:
                            duplicates[key1] = []
                        duplicates[key1].append((key2, similarity))
                        
                    comparison_count += 1
                    if comparison_count % 1000 == 0:
                        print(f"Progress: {comparison_count}/{total_comparisons} comparisons")
                        
                except Exception as e:
                    print(f"Error comparing {key1} and {key2}: {str(e)}")
                    
        if duplicates:
            all_duplicates[category] = duplicates
            
    return all_duplicates

def display_duplicate_results(duplicates_across_datasets):
    """
    Display duplicate results in a more organized and detailed way.
    Args:
        duplicates_across_datasets (dict): Results from check_duplicates_across_categories
    """
    total_duplicates = 0
    
    for category, duplicates in duplicates_across_datasets.items():
        if duplicates:
            print(f"\n=== Duplicates in category '{category}' ===")
            for original_key, duplicate_list in duplicates.items():
                orig_dataset, orig_file = original_key
                print(f"\nOriginal: {orig_file} in {orig_dataset}")
                print("Duplicates:")
                for (dup_dataset, dup_file), similarity in duplicate_list:
                    print(f"  - {dup_file} in {dup_dataset}")
                    print(f"    Similarity: {similarity:.4f}")
                total_duplicates += len(duplicate_list)
        else:
            print(f"\nNo duplicates found in category '{category}'.")
            
    print(f"\nTotal duplicate pairs found: {total_duplicates}")

# Example usage:
def run_duplicate_check(dataset_folders, threshold=0.5):
    """
    Run the complete duplicate checking process.
    Args:
        dataset_folders (list): List of dataset folder paths
        threshold (float): Similarity threshold
    """
    try:
        print("Starting duplicate check process...")
        print(f"Using threshold: {threshold}")
        print("Dataset folders:", dataset_folders)
        
        duplicates = check_duplicates_across_categories(dataset_folders, threshold)
        display_duplicate_results(duplicates)
        
    except Exception as e:
        print(f"Error during duplicate checking: {str(e)}")

In [4]:
dataset_folders = [
    "D:/Project/VQA/archive (2)/images",
    "D:/Project/VQA/archive (3)/images"
]

In [5]:
threshold = 0.5
run_duplicate_check(dataset_folders, threshold)

Starting duplicate check process...
Using threshold: 0.5
Dataset folders: ['D:/Project/VQA/archive (2)/images', 'D:/Project/VQA/archive (3)/images']
Processing folder: D:/Project/VQA/archive (2)/images
Processing category: airplane
Processing category: bear
Processing category: bicycle
Processing category: bird
Processing category: boat
Processing category: book
Processing category: bottle




Processing category: bowl
Processing category: bus
Processing category: car
Processing category: cat
Processing category: clock
Processing category: cow
Processing category: cup
Processing category: dog
Processing category: elephant
Processing category: fork
Processing category: giraffe
Processing category: hair drier
Processing category: horse
Processing category: knife
Processing category: motorcycle
Processing category: scissors
Processing category: sheep
Processing category: spoon
Processing category: teddy bear
Processing category: toothbrush
Processing category: train
Processing category: truck
Processing category: vase
Processing category: wine glass
Processing category: zebra
Processing folder: D:/Project/VQA/archive (3)/images
Processing category: airplane
Processing category: bear
Processing category: bicycle
Processing category: bird
Processing category: boat
Processing category: book
Processing category: bottle
Processing category: bowl
Processing category: bus
Processing c

In [9]:
def remove_duplicates_in_category(category_path, duplicates):
    """
    Remove duplicate images in a category across multiple datasets.
    
    Args:
        category_path (str): Path to the category folder
        duplicates (dict): Dictionary of duplicate mappings from check_duplicates_across_categories
    
    Returns:
        tuple: (num_removed, num_errors) - Count of removed files and errors encountered
    """
    num_removed = 0
    num_errors = 0
    
    for original_key, duplicate_list in duplicates.items():
        orig_dataset, orig_file = original_key
        orig_path = os.path.join(orig_dataset, category_path, orig_file)
        
        # Verify original exists before removing duplicates
        if not os.path.exists(orig_path):
            print(f"Warning: Original file not found: {orig_path}")
            continue
            
        for (dup_dataset, dup_file), similarity in duplicate_list:
            dup_path = os.path.join(dup_dataset, category_path, dup_file)
            
            try:
                if os.path.exists(dup_path):
                    os.remove(dup_path)
                    print(f"Removed duplicate: {dup_path}")
                    print(f"  Original preserved: {orig_path}")
                    print(f"  Similarity: {similarity:.4f}")
                    num_removed += 1
                else:
                    print(f"Warning: Duplicate file not found: {dup_path}")
                    
            except Exception as e:
                print(f"Error removing {dup_path}: {str(e)}")
                num_errors += 1
                
    return num_removed, num_errors

def remove_duplicates_across_all_categories(duplicates_across_datasets):
    """
    Remove duplicate images across all categories and datasets automatically.
    
    Args:
        duplicates_across_datasets (dict): Output from check_duplicates_across_categories
    
    Returns:
        dict: Summary of operations performed
    """
    summary = {
        'total_removed': 0,
        'total_errors': 0,
        'categories_processed': 0,
        'categories_with_duplicates': 0
    }
    
    print("Starting duplicate removal process...")
    
    for category, duplicates in duplicates_across_datasets.items():
        summary['categories_processed'] += 1
        
        if not duplicates:
            print(f"\nNo duplicates to remove in category '{category}'.")
            continue
            
        summary['categories_with_duplicates'] += 1
        print(f"\nProcessing category: {category}")
        
        num_removed, num_errors = remove_duplicates_in_category(
            category,
            duplicates
        )
        
        summary['total_removed'] += num_removed
        summary['total_errors'] += num_errors
        
        print(f"Category '{category}' summary:")
        print(f"  Files removed: {num_removed}")
        print(f"  Errors encountered: {num_errors}")
    
    print("\nOverall Summary:")
    print(f"Total categories processed: {summary['categories_processed']}")
    print(f"Categories with duplicates: {summary['categories_with_duplicates']}")
    print(f"Total files removed: {summary['total_removed']}")
    print(f"Total errors encountered: {summary['total_errors']}")
    
    return summary

def create_deduplicated_dataset(input_folders, output_base="/kaggle/working", threshold=0.5):
    """
    Create a deduplicated dataset in Kaggle working directory.
    
    Args:
        input_folders (list): List of input dataset folders
        output_base (str): Base output directory (default: /kaggle/working)
        threshold (float): Similarity threshold for duplicate detection
    
    Returns:
        dict: Summary of operations and path to deduplicated dataset
    """
    print("Starting deduplication process...")
    
    try:
        # Create temporary working directory
        with tempfile.TemporaryDirectory() as temp_dir:
            print(f"Created temporary directory: {temp_dir}")
            
            # First, copy all data to temporary directory maintaining structure
            temp_dataset_folders = []
            for input_folder in input_folders:
                try:
                    folder_name = os.path.basename(input_folder)
                    temp_dataset_path = os.path.join(temp_dir, folder_name)
                    
                    # Add dirs_exist_ok=True to handle existing directories
                    shutil.copytree(input_folder, temp_dataset_path, dirs_exist_ok=True)
                    temp_dataset_folders.append(temp_dataset_path)
                    print(f"Copied {input_folder} to temporary location")
                except Exception as e:
                    print(f"Error copying {input_folder}: {str(e)}")
                    continue
            
            if not temp_dataset_folders:
                raise ValueError("No datasets were successfully copied to temporary location")
            
            # Run duplicate detection on temporary copies
            print("\nStep 1: Detecting duplicates...")
            duplicates = check_duplicates_across_categories(temp_dataset_folders, threshold)
            
            # Remove duplicates in temporary location
            print("\nStep 2: Removing duplicates...")
            summary = remove_duplicates_across_all_categories(duplicates)
            
            # Create final output directory in working
            output_dir = os.path.join(output_base, "deduplicated_dataset")
            if os.path.exists(output_dir):
                print(f"Removing existing output directory: {output_dir}")
                shutil.rmtree(output_dir)
            os.makedirs(output_dir)
            
            # Merge all remaining files into single deduplicated dataset
            print("\nStep 3: Creating final deduplicated dataset...")
            files_copied = 0
            for temp_folder in temp_dataset_folders:
                for category in os.listdir(temp_folder):
                    category_path = os.path.join(temp_folder, category)
                    if not os.path.isdir(category_path):
                        continue
                        
                    # Create category directory in output if it doesn't exist
                    output_category_dir = os.path.join(output_dir, category)
                    os.makedirs(output_category_dir, exist_ok=True)
                    
                    # Copy remaining files
                    for filename in os.listdir(category_path):
                        try:
                            src = os.path.join(category_path, filename)
                            dst = os.path.join(output_category_dir, filename)
                            if not os.path.exists(dst):  # Avoid overwriting if file already exists
                                shutil.copy2(src, dst)
                                files_copied += 1
                        except Exception as e:
                            print(f"Error copying file {filename}: {str(e)}")
                            continue
            
            # Count final files
            final_count = sum(len(files) for _, _, files in os.walk(output_dir))
            
            # Add final statistics to summary
            summary['output_path'] = output_dir
            summary['final_file_count'] = final_count
            summary['files_copied'] = files_copied
            
            print("\nFinal Summary:")
            print(f"Output directory: {output_dir}")
            print(f"Total files in deduplicated dataset: {final_count}")
            print(f"Total duplicates removed: {summary['total_removed']}")
            print(f"Categories processed: {summary['categories_processed']}")
            
            return summary
            
    except Exception as e:
        print(f"Error in deduplication process: {str(e)}")
        return {
            'error': str(e),
            'total_removed': 0,
            'total_errors': 1,
            'categories_processed': 0,
            'categories_with_duplicates': 0,
            'final_file_count': 0
        }

def check_dataset_statistics(dataset_path):
    """
    Print statistics about the dataset.
    
    Args:
        dataset_path (str): Path to the dataset directory
    """
    stats = {'total_files': 0, 'categories': {}}
    
    for category in os.listdir(dataset_path):
        category_path = os.path.join(dataset_path, category)
        if not os.path.isdir(category_path):
            continue
            
        num_files = len([f for f in os.listdir(category_path) 
                        if os.path.isfile(os.path.join(category_path, f))])
        stats['categories'][category] = num_files
        stats['total_files'] += num_files
    
    print("\nDataset Statistics:")
    print(f"Total files: {stats['total_files']}")
    print("\nFiles per category:")
    for category, count in stats['categories'].items():
        print(f"  {category}: {count}")

In [7]:
output_base = "D:/Project/VQA"

In [10]:
summary = create_deduplicated_dataset(
        input_folders=dataset_folders,
        output_base=output_base,
        threshold=0.5
    )

Starting deduplication process...
Created temporary directory: C:\Users\DELL\AppData\Local\Temp\tmpq1rq0odf
Copied D:/Project/VQA/archive (2)/images to temporary location
Copied D:/Project/VQA/archive (3)/images to temporary location

Step 1: Detecting duplicates...
Processing folder: C:\Users\DELL\AppData\Local\Temp\tmpq1rq0odf\images
Processing category: airplane
Processing category: bear
Processing category: bicycle
Processing category: bird
Processing category: boat
Processing category: book
Processing category: bottle
Processing category: bowl
Processing category: bus
Processing category: car
Processing category: cat
Processing category: clock
Processing category: cow
Processing category: cup
Processing category: dog
Processing category: elephant
Processing category: fork
Processing category: giraffe
Processing category: hair drier
Processing category: horse
Processing category: knife
Processing category: motorcycle
Processing category: scissors
Processing category: sheep
Processi