# Duplicate File Finder
See [README.md](README.md) for detailed documentation.

In [None]:
# --- Mode ---
# Selects which stages of the notebook run:
#   "new"         = fresh scan (the Scan cell clears any checkpoint for root_dir first)
#   "resume_scan" = continue from checkpoint (the Scan cell requires a valid one for root_dir)
#   "load_report" = skip the scan and go straight to loading the report at report_path
mode = "new"

root_dir = "C:/path/to/scan/"          # used for "new" and "resume_scan"
report_path = "duplicate_report.txt"   # used for "load_report", also where new reports are saved
# NOTE: the Execute snippet treats every value other than "trash" as permanent deletion.
delete_mode = "trash"                  # "trash" or "permanent"
# Passed both to the scan configuration and to the report generator.
default_keep_rule = "oldest"           # "oldest", "newest", "shortest_path", "first_found"

In [None]:
# --- Scan Filters (set to None to disable) ---
# All seven values are forwarded unchanged to ScanConfig by the Scan cell.
# NOTE(review): the extension exclusivity is not checked in this notebook —
# presumably ScanConfig enforces it; confirm before setting both.
ignore_extensions = None      # e.g. [".jpg", ".png"]
only_extensions = None        # e.g. [".pdf"] — mutually exclusive with ignore_extensions
# Size limits are a number plus a separate unit variable
# (presumably max_size is read in max_size_unit, min_size in min_size_unit — confirm).
max_size = None               # e.g. 50
max_size_unit = "MB"          # "KB", "MB", "GB"
min_size = None               # e.g. 1
min_size_unit = "KB"          # "KB", "MB", "GB"

## Scan

In [None]:
from tqdm.notebook import tqdm
from src.duplicate_finder import (
    ScanConfig, find_all_duplicate_files, generate_report,
    clear_checkpoint, validate_checkpoint,
)

# Scan results. Stays empty when the scan is skipped; the Report cell checks
# for that before writing anything.
grouped = {}

# Fail fast on a typo instead of silently reporting it as "load_report".
if mode not in ("new", "resume_scan", "load_report"):
    raise ValueError(
        f'Unknown mode {mode!r}; expected "new", "resume_scan" or "load_report".'
    )

if mode in ("new", "resume_scan"):
    if mode == "new":
        # A fresh scan must not pick up a checkpoint left by an earlier run.
        clear_checkpoint(root_dir)
    elif not validate_checkpoint(root_dir):
        # Explicit raise rather than assert: asserts vanish under `python -O`.
        raise RuntimeError("No valid checkpoint found in " + root_dir)

    config = ScanConfig(
        root_dir=root_dir,
        resume=(mode == "resume_scan"),
        report_path=report_path,
        ignore_extensions=ignore_extensions,
        only_extensions=only_extensions,
        max_size=max_size,
        max_size_unit=max_size_unit,
        min_size=min_size,
        min_size_unit=min_size_unit,
        default_keep_rule=default_keep_rule,
    )

    # The context manager closes the progress bar even if the scan raises,
    # so an interrupted run does not leave a dangling widget behind.
    with tqdm(desc="Scanning", unit=" files") as pbar:
        grouped = find_all_duplicate_files(config, on_progress=lambda f: pbar.update(1))

    print(f'Scan complete. Found {len(grouped)} duplicate group(s).')
else:
    print('Mode is load_report — skipping scan. Run the report cells below.')

## Report

In [None]:
# Write the report for the scanned groups and print a short summary.
# Nothing is written when there are no groups (scan skipped or no duplicates).
if grouped:
    generate_report(grouped, report_path, keep_rule=default_keep_rule)

    def _fmt(size):
        """Render a byte count using the largest binary unit that fits."""
        for exponent, suffix in ((3, 'GB'), (2, 'MB'), (1, 'KB')):
            unit_bytes = 1024 ** exponent
            if size >= unit_bytes:
                return f'{size / unit_bytes:.1f} {suffix}'
        return f'{size} B'

    # The summary counts every entry after the first in each group as removable.
    to_remove_count = 0
    removable_size = 0
    for members in grouped.values():
        to_remove_count += len(members) - 1
        for entry in members[1:]:
            removable_size += entry['file_size']

    group_count = len(grouped)
    print(f'Report written to: {report_path}')
    print(f'{group_count} groups, {to_remove_count} files to remove, {_fmt(removable_size)} recoverable')
else:
    print('No scan results. Using load_report mode or no duplicates found.')

Review the report file and edit it if needed before running the next cell. Every group must keep at least one file.

In [None]:
from src.duplicate_finder import load_report, validate_report, get_files_to_remove

# Reset first: if validation fails (or raises), a to_remove list left in the
# kernel by an earlier run must never reach the Execute cell.
to_remove = []

# Validate the (possibly hand-edited) report before trusting its contents.
valid, message = validate_report(report_path)
if not valid:
    print(f'Report validation failed: {message}')
else:
    report = load_report(report_path)
    to_remove = get_files_to_remove(report)
    # NOTE(review): assumes len(report) counts files rather than groups — confirm.
    to_keep = len(report) - len(to_remove)
    print(f'{to_keep} file(s) to keep, {len(to_remove)} file(s) to remove.')

## Execute

In [None]:
from src.duplicate_finder import remove_files, trash_files

# Deletion is left commented out on purpose so that "Run All" never removes files.
# The snippet acts on `to_remove` as produced by the report-loading cell; re-run
# that cell first so the list matches the current report file.
# NOTE(review): the `else` branch runs for ANY delete_mode other than "trash",
# so a typo in delete_mode results in permanent deletion — double-check the value.
# Uncomment to execute:
# if delete_mode == "trash":
#     trash_files(to_remove)
#     print(f'Moved {len(to_remove)} file(s) to trash.')
# else:
#     remove_files(to_remove)
#     print(f'Permanently removed {len(to_remove)} file(s).')