In [1]:
import os
import math
import fiftyone as fo
import fiftyone.brain as fob
from typing import Any, Dict, List, Optional, Tuple

## Visualize and remove duplicates

In [2]:
# Image roots used by this notebook. Only `val_dir` is loaded in this cell;
# `dataset_dir` and `train_dir` are defined here but not used by it.
dataset_dir = "../data/Documents/"
train_dir = "../data/train/"
val_dir = "../data/val/"
# Build a FiftyOne dataset from the validation directory, read as an
# image-classification directory tree.
dataset = fo.Dataset.from_dir(
    dataset_dir=val_dir,
    dataset_type=fo.types.ImageClassificationDirectoryTree
)

 100% |█████████████████| 206/206 [89.0ms elapsed, 0s remaining, 2.3K samples/s]   


In [None]:
# Run FiftyOne Brain's uniqueness computation on the dataset. The later cells
# sort and compare samples on the resulting per-sample "uniqueness" value, so
# this cell must run before them.
fob.compute_uniqueness(dataset)
# Debugging aid: uncomment to inspect the first sample.
# print(dataset.first())

In [3]:
# Open the FiftyOne App on the loaded dataset; `session` is reused by later
# cells to change what the App displays.
session = fo.launch_app(dataset)

In [None]:
# Sort in increasing order of uniqueness (least unique first), so likely
# duplicates appear at the top of the view.
# Requires the "uniqueness" values to have been computed already.
dups_view = dataset.sort_by("uniqueness")

# Show the sorted view in the already-open App session
session.view = dups_view

In [None]:
def remove_dupes(sorted_data: List[fo.Sample], rtol: float = 1e-9) -> Tuple[List[fo.Sample], set[str]]:
    """Split samples sorted by uniqueness into kept samples and duplicate paths.

    Two samples are treated as duplicates when their ``'uniqueness'`` values
    are equal within the relative tolerance ``rtol`` (see ``math.isclose``).
    Because the input is sorted, each sample is only compared with the samples
    that follow it, and the scan stops at the first one that is not close.
    Every sample is used as a reference, including ones already flagged, so a
    run of pairwise-close values is flagged as a whole except for its first
    member.

    Args:
        sorted_data: Samples in ascending order of ``'uniqueness'``. Each item
            must support ``item['uniqueness']`` and ``item['filepath']``.
        rtol: Relative tolerance passed to ``math.isclose``.

    Returns:
        A tuple ``(kept, dup_paths)``: ``kept`` is the list of samples whose
        file path was not flagged as a duplicate, in their original order, and
        ``dup_paths`` is the set of file paths flagged as duplicates. Empty
        and single-sample inputs contain no duplicates and are returned as-is.
    """
    dup_paths: set[str] = set()
    dedupe_dataset: List[fo.Sample] = []
    total = len(sorted_data)
    # Iterate over every index, including the last one: the final sample has
    # no successors to compare against, but it still has to be kept when it is
    # not itself a duplicate of an earlier sample.
    for i in range(total):
        current = sorted_data[i]
        for j in range(i + 1, total):
            candidate = sorted_data[j]
            if not math.isclose(current['uniqueness'], candidate['uniqueness'], rel_tol=rtol):
                # Sorted input: once a value is out of tolerance, later ones
                # are not checked against `current`.
                break
            dup_paths.add(candidate['filepath'])
        if current['filepath'] not in dup_paths:
            dedupe_dataset.append(current)

    return dedupe_dataset, dup_paths

In [None]:
# Sort the dataset by uniqueness
sorted_dataset = sorted(dataset, key=lambda x: x["uniqueness"]) # list of samples

In [None]:
# Split the sorted samples into the ones to keep and the file paths flagged
# as duplicates, using a relative tolerance of 1e-5 on the uniqueness value.
deduped_dataset_list, dup_paths = remove_dupes(sorted_dataset, rtol=1e-5)

# Collect the kept samples into a fresh FiftyOne dataset.
deduped_dataset = fo.Dataset()
deduped_dataset.add_samples(deduped_dataset_list)

### Remove duplicate images

In [None]:
# Inspect the file paths flagged as duplicates before deleting anything.
dup_paths

In [None]:
# Permanently delete every file flagged as a duplicate, counting the paths
# that no longer exist on disk.
not_found_cnt = 0
for path in dup_paths:
    # Nothing to delete: report it, count it, and move on to the next path.
    if not os.path.exists(path):
        print(f"File not found: {path}")
        not_found_cnt += 1
        continue

    try:
        os.remove(path)
        print(f"Deleted: {path}")
    except OSError as err:
        # Best effort: report the failure and keep going with the other files.
        print(f"Error deleting {path}: {err}")

In [None]:
# Number of flagged duplicate paths that were not found on disk.
not_found_cnt

In [None]:
# Open the App on the de-duplicated dataset to review the result.
# Note: this rebinds `session` to the newly launched session.
session = fo.launch_app(deduped_dataset)

In [None]:
# Give the de-duplicated dataset a name and mark it persistent so it can be
# loaded by name later.
# NOTE(review): re-running this cell presumably fails if a dataset with this
# name already exists in the FiftyOne database; confirm, and pick a new name
# or delete the old dataset first.
deduped_dataset.name = "Deduped dataset - automatic1" # any name
deduped_dataset.persistent = True

## Delete duplicate images (deprecated)

In [None]:
# List the datasets FiftyOne currently knows about, to confirm that the saved
# de-duplicated dataset is available.
all_datasets = fo.list_datasets()
all_datasets

In [None]:
# Load the saved de-duplicated dataset by name.
# Note: this rebinds `dataset`, which earlier cells use for the original data.
dataset = fo.load_dataset('Deduped dataset - automatic1')
session = fo.launch_app(dataset) # optionally view the saved dataset
# File paths of every sample that survived de-duplication.
unique_images = [sample['filepath'] for sample in dataset]
unique_images

**Note**: the code below is destructive and has been observed to delete the entire directory. Verify it on a copy of the data before running it.

In [None]:
start_directory = "../data/Documents/"

# Compare on the full, absolute path. `unique_images` holds the samples'
# `filepath` values, which FiftyOne stores as full paths rather than bare
# filenames, so a test on the bare filename never matches and would delete
# every file. A set also makes each membership test O(1).
unique_paths = {os.path.abspath(image_path) for image_path in unique_images}

# Recursively collect every file under the start directory first, so that
# nothing is deleted until the whole candidate list has been checked.
found_files = [
    os.path.abspath(os.path.join(root, filename))
    for root, dirs, files in os.walk(start_directory)
    for filename in files
]
files_to_delete = [file_path for file_path in found_files if file_path not in unique_paths]

if found_files and len(files_to_delete) == len(found_files):
    # Safety net: if not a single file under the directory is in the keep-list,
    # the dataset was almost certainly built from a different directory.
    # Deleting now would wipe the whole tree, so refuse instead.
    print(
        f"Aborted: none of the {len(found_files)} files under {start_directory} "
        "are in unique_images; nothing was deleted."
    )
else:
    # Delete only the files that are not in the keep-list.
    for file_path in files_to_delete:
        try:
            os.remove(file_path)  # Delete the file
            print(f"Deleted: {file_path}")
        except OSError as e:
            # Best effort: report the failure and continue with the rest.
            print(f"Error deleting {file_path}: {e}")

    print("Deletion process complete.")