In [1]:
# CONCEPTS

# hash all files in source and target
# source = {hash: [set of paths to files with that hash]}
# target = {hash: [set of paths to files with that hash]}
# for hash in target:
    # if hash not in source.keys():
        # delete all files in target[hash]
# for hash in source:
    # if hash not in target.keys():
        # copy all files in source[hash] to target
    # elif source[hash] != target[hash]:
        # new_files = source[hash] - target[hash]
        # outdated_files = target[hash] - source[hash]
        # for new_file in new_files:
            # if len(outdated_files):
                # file = outdated_files.pop()
                # os.rename(file, new_file)
            # else:
                # copy new_file to target

In [2]:
from pathlib import Path
from typing import Set, Callable, Dict, List
import hashlib
from fnmatch import fnmatch
import logging
from concurrent.futures import ThreadPoolExecutor
import os
import abc


from tqdm import tqdm

In [None]:
class FileHasher(abc.ABC):
    @abc.abstractmethod
    def __call__(self, file_path:Path) -> str:
        raise NotImplementedError

In [3]:
class ModificationStampHasher(FileHasher):
    def __init__(self, base_path: Path) -> None:
        self.base_path = base_path

    def __call__(self, file_path:Path) -> str:
        rel_path = file_path.relative_to(self.base_path)
        return f"{rel_path}_{os.path.getmtime(file_path)}"

class MD5Hasher(FileHasher):
    def __call__(self, file_path:Path) -> str:
        """Return the md5 hash of a file."""
        hasher = hashlib.md5()
        block_size = 64 * 1024

        with open(file_path, "rb") as file:
            while True:
                data = file.read(block_size)
                if not data:
                    break
                hasher.update(data)

        return hasher.hexdigest()

In [4]:
def validate_file(file_path: Path, excluded_paths: List[Path] = None) -> bool:
    """Return True if file is valid, False otherwise."""
    excluded_paths = excluded_paths or []
    try:
        if file_path.is_file() and not any(
            fnmatch(str(file_path), str(excluded_path)) for excluded_path in excluded_paths
        ):
            return True
    except OSError:
        logging.warning(f"Could not read {file_path}")


def get_all_files(folder_path: Path, excluded_paths: List[Path] = None, n_threads: int = 1) -> List[Path]:
    """Return a list of all valid files in a folder."""
    paths = list(folder_path.rglob("*"))

    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        with tqdm(total=len(paths)) as pbar:
            futures = []
            for p in paths:
                future = executor.submit(validate_file, p, excluded_paths)
                future.add_done_callback(lambda _: pbar.update())
                futures.append(future)

            paths = [p for p, future in zip(paths, futures) if future.result()]

    return paths

In [5]:
def get_file_hashes(
    file_paths: List[Path], file_hash_func = None, n_threads: int = 1
) -> Dict[Path, str]:
    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        with tqdm(total=len(file_paths)) as pbar:
            
            futures = []
            for file_path in file_paths:
                future = executor.submit(file_hash_func, file_path)
                future.add_done_callback(lambda _: pbar.update())
                futures.append(future)

            file_hashes = [future.result() for future in futures]

    result = dict(zip(file_paths, file_hashes))

    return result

# def get_file_hashes(
#     folder_path: Path, file_hash_func: Callable = md5_file_hash, n_threads: int = 1
# ) -> Dict[Path, str]:
#     """Return a dictionary of file hashes for all files in a folder."""
#     file_paths = get_all_files(folder_path, n_threads=n_threads)
#     return _get_file_hashes(file_paths, file_hash_func, n_threads=n_threads)



In [1]:
from typing import Any


class Syncer:
    def __init__(self, source_folder, target_folder) -> None:
        self.source_folder = source_folder
        self.target_folder = target_folder

    def __call__(self) -> None:
        

def get_source_and_target_hashes(
    source_folder_path: Path,
    target_folder_path: Path,
    file_hash_func: Callable = md5_file_hash,
    n_threads: int = 1,
    excluded_paths: List[Path] = None,
):
    logging.info(f"Syncing {source_folder_path} to {target_folder_path}")

    logging.info("Getting all valid files in source folder...")
    source_file_paths = get_all_files(
        source_folder_path, excluded_paths=excluded_paths, n_threads=n_threads
    )

    logging.info("Hashing all valid files in source folder...")
    source_hashes = get_file_hashes(
        source_file_paths, file_hash_func, n_threads=n_threads
    )

    logging.info("Getting all valid files in target folder...")
    target_file_paths = get_all_files(
        target_folder_path, excluded_paths=excluded_paths, n_threads=n_threads
    )

    logging.info("Hashing all valid files in target folder...")
    target_hashes = get_file_hashes(
        target_file_paths, file_hash_func, n_threads=n_threads
    )

    return source_hashes, target_hashes


def sync_by_path(source_hashes, target_hashes):
    # first delete all files in target that are not in source
    for path, hash in target_hashes.items():
        if path not in source_hashes.keys():
            (target_folder / path).unlink()
            

    for path, hash in source_hashes.items():
        if path in target_hashes.keys():
            if hash != target_hashes[path]:
                
        if path not in target_hashes.keys():
            
    
    # for path, hash in source_hashes.items():
    #     if path not in target_hashes.keys():
    #         yield path, hash
    # set(source_hashes.items())




    # outdated_files = target_hashes.keys() - source_hashes.keys()
    # logging.info(f'Deleting {len(outdated_files)} files from target folder...')
    # for hash in tqdm(outdated_files):
    #     for file_path in target_hashes[hash]:
    #         file_path.unlink()
    #     target_hashes.pop(hash)

    # new_files = source_hashes.keys() - target_hashes.keys()
    # logging.info(f'Copying {len(new_files)} files to target folder...')
    # for hash in tqdm(new_files):
    #     for file_path in source_hashes[hash]:
    #         # file_path.rename(target_folder_path / file_path.name)
    #     target_hashes[hash] = source_hashes[hash]

SyntaxError: incomplete input (2291194104.py, line 59)

In [7]:
p = Path("D:/Dokumente/") # 52GB, 353.000 Dateien


In [9]:
logging.basicConfig(level=logging.INFO)

sync_folders(p, None, n_threads=4, excluded_paths=None, file_hash_func=modification_name)

INFO:root:Syncing D:\Dokumente to None
INFO:root:Getting all valid files in source folder...
100%|██████████| 394935/394935 [02:00<00:00, 3280.28it/s]
INFO:root:Hashing all valid files in source folder...
  0%|          | 278/353731 [00:16<5:41:32, 17.25it/s] 
