In [None]:
import os
import hashlib
from tqdm.auto import tqdm

def calculate_file_checksum(file_path, block_size=65536, hash_algorithm=hashlib.md5):
    """Return the hexadecimal digest of a file's contents.

    The file is opened in binary mode and fed to the hash object in
    successive reads of ``block_size`` bytes until a read returns no data.

    Args:
        file_path: Path of the file to hash.
        block_size: Size passed to each ``read`` call.
        hash_algorithm: Zero-argument callable returning a hash object that
            provides ``update`` and ``hexdigest`` (``hashlib.md5`` by default).

    Returns:
        The digest as a hexadecimal string.
    """
    digest = hash_algorithm()
    with open(file_path, 'rb') as stream:
        # An empty bytes object is what read() yields at end of file, so it
        # serves as the sentinel that stops the iteration.
        for chunk in iter(lambda: stream.read(block_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

def find_duplicate_files(directory):
    """Find files with identical content under a directory tree.

    Every file reached by ``os.walk(directory)`` is hashed with
    ``calculate_file_checksum`` and grouped by the resulting checksum.

    Args:
        directory: Root of the directory tree to scan.

    Returns:
        A dictionary mapping each checksum shared by at least two files to
        the list of all paths that produced it. The list includes the first
        file encountered, not only the later copies. Checksums seen for a
        single file are omitted, so an empty dictionary means no duplicates.
    """
    paths_by_checksum = {}

    for root, _, files in os.walk(directory):
        for file_name in files:
            file_path = os.path.join(root, file_name)
            file_checksum = calculate_file_checksum(file_path)
            # Record every path, including the first one seen for a checksum;
            # otherwise a group of two identical files would list only one.
            paths_by_checksum.setdefault(file_checksum, []).append(file_path)

    # Keep only the checksums that really are shared by several files.
    return {
        checksum: file_paths
        for checksum, file_paths in paths_by_checksum.items()
        if len(file_paths) > 1
    }

if __name__ == "__main__":
    # Script entry point: scan the share below and print each reported group.
    target_directory = "/run/user/30046150/gvfs/smb-share:server=data2.thecrick.org,share=lab-gutierrezm/data/STPs/electron_microscopy/inputs/gutierrezm"
    duplicate_files = find_duplicate_files(target_directory)

    for digest, paths in tqdm(duplicate_files.items()):
        # Groups holding a single path are not printed.
        if len(paths) <= 1:
            continue

        print(f"Duplicates with checksum {digest}:")
        for path in tqdm(paths):
            print(f"  - {path}")
