In [1]:
# @title Moves duplicate files under the current directory into a 'dupeshit' folder, keeping the oldest version in place
import os
import hashlib
import shutil
from collections import defaultdict

def get_file_hash(filepath):
    """
    Return the SHA-256 hex digest of the file at `filepath`.

    The file is opened in binary mode and streamed in 4096-byte chunks so
    that large files are never loaded into memory in one piece. If opening
    or reading fails, the error is printed and None is returned instead.
    """
    chunk_size = 4096
    digest = hashlib.sha256()
    try:
        with open(filepath, "rb") as stream:
            while True:
                chunk = stream.read(chunk_size)
                if not chunk:
                    # An empty read means end of file.
                    break
                digest.update(chunk)
    except Exception as err:
        print(f"Error reading {filepath}: {err}")
        return None
    return digest.hexdigest()

def find_and_move_duplicates():
    """
    Find duplicate files under the current working directory and move them aside.

    Files are grouped by size first (cheap), then by SHA-256 content hash
    (via get_file_hash) for sizes shared by two or more files. Within each
    group of identical files, the one with the oldest modification time
    stays where it is and every other copy is moved into a 'dupeshit'
    folder created inside the current working directory.

    Symlinks are resolved with os.path.realpath, and every resolved path is
    considered only once, so a link and its target are never mistaken for
    two copies of the same content. Empty files are ignored.

    Progress and a final summary are printed; nothing is returned.
    """

    # os.getcwd() is the current working directory, which is not necessarily
    # the directory that contains this script.
    start_dir = os.getcwd()
    dupes_dir = os.path.join(start_dir, "dupeshit")

    # --- 1. Setup ---
    try:
        os.makedirs(dupes_dir, exist_ok=True)
        print(f"Duplicates will be moved to: {dupes_dir}\n")
    except OSError as e:
        print(f"Error creating directory {dupes_dir}: {e}")
        return

    real_dupes_dir = os.path.realpath(dupes_dir)

    # __file__ is not defined in notebooks or interactive sessions, so look
    # it up defensively instead of referencing it directly.
    script_file = globals().get("__file__")
    script_path = os.path.realpath(script_file) if script_file else None

    # --- 2. Group files by size (fast pre-filter) ---
    size_groups = defaultdict(list)
    seen_paths = set()
    print(f"Scanning files in {start_dir}...")

    for dirpath, dirnames, filenames in os.walk(start_dir):
        # Prune the quarantine folder from the walk. An exact path comparison
        # avoids also skipping siblings such as 'dupeshit_old'.
        dirnames[:] = [
            d for d in dirnames
            if os.path.realpath(os.path.join(dirpath, d)) != real_dupes_dir
        ]

        for filename in filenames:
            try:
                # Follow symlinks (shortcuts) to the real file.
                real_path = os.path.realpath(os.path.join(dirpath, filename))

                # Don't check the script file itself.
                if real_path == script_path:
                    continue
                # The same real file reached twice (e.g. through a symlink)
                # is not a duplicate of itself.
                if real_path in seen_paths:
                    continue
                # A symlink may point into the quarantine folder.
                if real_path.startswith(real_dupes_dir + os.sep):
                    continue
                if not os.path.isfile(real_path):
                    continue

                file_size = os.path.getsize(real_path)
            except OSError:
                continue  # Skip broken links and unreadable entries

            seen_paths.add(real_path)
            if file_size > 0:  # Ignore empty files
                size_groups[file_size].append(real_path)

    # --- 3. Group by content hash (slower, accurate check) ---
    hash_groups = defaultdict(list)
    total_files_checked = 0
    total_kept = 0

    print("Checking for duplicate content (this may take a while)...")
    for files in size_groups.values():
        total_files_checked += len(files)

        if len(files) < 2:
            # A file with a unique size cannot have a duplicate; it is kept.
            total_kept += len(files)
            continue

        # Only hash files that are potential duplicates
        for filepath in files:
            file_hash = get_file_hash(filepath)
            if file_hash:
                hash_groups[file_hash].append(filepath)

    # --- 4. Find oldest and move the rest ---
    total_moved = 0

    print("\n--- Processing Duplicates ---")

    for file_hash, files in hash_groups.items():
        if len(files) < 2:
            # This file had a unique hash
            total_kept += 1
            continue

        # We have true duplicates. Collect (modification_time, filepath)
        # pairs, skipping files that disappeared or became unreadable.
        file_times = []
        for filepath in files:
            try:
                file_times.append((os.path.getmtime(filepath), filepath))
            except OSError:
                continue

        if not file_times:
            # Every file in the group vanished since it was hashed.
            continue

        # Oldest first; ties are broken by path so the choice is deterministic.
        file_times.sort()

        file_to_keep = file_times[0][1]
        print(f"\nGroup (Hash: {file_hash[:10]}...):")
        print(f"  Keeping (Oldest): {file_to_keep}")
        total_kept += 1

        # Move all the others
        for _mtime, filepath in file_times[1:]:
            try:
                filename = os.path.basename(filepath)
                name, ext = os.path.splitext(filename)
                dest_path = os.path.join(dupes_dir, filename)

                # Handle name conflicts in the 'dupeshit' folder
                counter = 1
                while os.path.exists(dest_path):
                    dest_path = os.path.join(dupes_dir, f"{name}_{counter}{ext}")
                    counter += 1

                shutil.move(filepath, dest_path)
                print(f"  Moved:            {filepath}")
                total_moved += 1
            except OSError as e:
                # shutil.Error is a subclass of OSError, so this covers it too.
                print(f"  Error moving {filepath}: {e}")

    print("\n--- Summary ---")
    print(f"Total Files Scanned: {total_files_checked}")
    print(f"Unique Files Kept:   {total_kept}")
    print(f"Duplicates Moved:    {total_moved}")

# Entry point: run the duplicate scan only when this code is executed
# directly (its module name is "__main__"), not when it is imported.
if __name__ == "__main__":
    find_and_move_duplicates()

In [None]:
# @title Find and Move Duplicates --- IGNORE --- keeps the oldest version and moves the rest to a 'dupeshit' folder
import os
import hashlib
import shutil
from collections import defaultdict

def get_file_hash(filepath):
    """
    Return the SHA-256 hex digest of the file at `filepath`.

    The file is opened in binary mode and streamed in 4096-byte chunks so
    that large files are never loaded into memory in one piece. If opening
    or reading fails, the error is printed and None is returned instead.
    """
    chunk_size = 4096
    digest = hashlib.sha256()
    try:
        with open(filepath, "rb") as stream:
            while True:
                chunk = stream.read(chunk_size)
                if not chunk:
                    # An empty read means end of file.
                    break
                digest.update(chunk)
    except Exception as err:
        print(f"Error reading {filepath}: {err}")
        return None
    return digest.hexdigest()

def find_and_move_duplicates():
    """
    Find duplicate files under the current working directory and move them aside.

    Files are grouped by size first (cheap), then by SHA-256 content hash
    (via get_file_hash) for sizes shared by two or more files. Within each
    group of identical files, the one with the oldest modification time
    stays where it is and every other copy is moved into a 'dupeshit'
    folder created inside the current working directory.

    Symlinks are resolved with os.path.realpath, and every resolved path is
    considered only once, so a link and its target are never mistaken for
    two copies of the same content. Empty files are ignored.

    Progress and a final summary are printed; nothing is returned.
    """

    # os.getcwd() is the current working directory, which is not necessarily
    # the directory that contains this script.
    start_dir = os.getcwd()
    dupes_dir = os.path.join(start_dir, "dupeshit")

    # --- 1. Setup ---
    try:
        os.makedirs(dupes_dir, exist_ok=True)
        print(f"Duplicates will be moved to: {dupes_dir}\n")
    except OSError as e:
        print(f"Error creating directory {dupes_dir}: {e}")
        return

    real_dupes_dir = os.path.realpath(dupes_dir)

    # __file__ is not defined in notebooks or interactive sessions, so look
    # it up defensively instead of referencing it directly.
    script_file = globals().get("__file__")
    script_path = os.path.realpath(script_file) if script_file else None

    # --- 2. Group files by size (fast pre-filter) ---
    size_groups = defaultdict(list)
    seen_paths = set()
    print(f"Scanning files in {start_dir}...")

    for dirpath, dirnames, filenames in os.walk(start_dir):
        # Prune the quarantine folder from the walk. An exact path comparison
        # avoids also skipping siblings such as 'dupeshit_old'.
        dirnames[:] = [
            d for d in dirnames
            if os.path.realpath(os.path.join(dirpath, d)) != real_dupes_dir
        ]

        for filename in filenames:
            try:
                # Follow symlinks (shortcuts) to the real file.
                real_path = os.path.realpath(os.path.join(dirpath, filename))

                # Don't check the script file itself.
                if real_path == script_path:
                    continue
                # The same real file reached twice (e.g. through a symlink)
                # is not a duplicate of itself.
                if real_path in seen_paths:
                    continue
                # A symlink may point into the quarantine folder.
                if real_path.startswith(real_dupes_dir + os.sep):
                    continue
                if not os.path.isfile(real_path):
                    continue

                file_size = os.path.getsize(real_path)
            except OSError:
                continue  # Skip broken links and unreadable entries

            seen_paths.add(real_path)
            if file_size > 0:  # Ignore empty files
                size_groups[file_size].append(real_path)

    # --- 3. Group by content hash (slower, accurate check) ---
    hash_groups = defaultdict(list)
    total_files_checked = 0
    total_kept = 0

    print("Checking for duplicate content (this may take a while)...")
    for files in size_groups.values():
        total_files_checked += len(files)

        if len(files) < 2:
            # A file with a unique size cannot have a duplicate; it is kept.
            total_kept += len(files)
            continue

        # Only hash files that are potential duplicates
        for filepath in files:
            file_hash = get_file_hash(filepath)
            if file_hash:
                hash_groups[file_hash].append(filepath)

    # --- 4. Find oldest and move the rest ---
    total_moved = 0

    print("\n--- Processing Duplicates ---")

    for file_hash, files in hash_groups.items():
        if len(files) < 2:
            # This file had a unique hash
            total_kept += 1
            continue

        # We have true duplicates. Collect (modification_time, filepath)
        # pairs, skipping files that disappeared or became unreadable.
        file_times = []
        for filepath in files:
            try:
                file_times.append((os.path.getmtime(filepath), filepath))
            except OSError:
                continue

        if not file_times:
            # Every file in the group vanished since it was hashed.
            continue

        # Oldest first; ties are broken by path so the choice is deterministic.
        file_times.sort()

        file_to_keep = file_times[0][1]
        print(f"\nGroup (Hash: {file_hash[:10]}...):")
        print(f"  Keeping (Oldest): {file_to_keep}")
        total_kept += 1

        # Move all the others
        for _mtime, filepath in file_times[1:]:
            try:
                filename = os.path.basename(filepath)
                name, ext = os.path.splitext(filename)
                dest_path = os.path.join(dupes_dir, filename)

                # Handle name conflicts in the 'dupeshit' folder
                counter = 1
                while os.path.exists(dest_path):
                    dest_path = os.path.join(dupes_dir, f"{name}_{counter}{ext}")
                    counter += 1

                shutil.move(filepath, dest_path)
                print(f"  Moved:            {filepath}")
                total_moved += 1
            except OSError as e:
                # shutil.Error is a subclass of OSError, so this covers it too.
                print(f"  Error moving {filepath}: {e}")

    print("\n--- Summary ---")
    print(f"Total Files Scanned: {total_files_checked}")
    print(f"Unique Files Kept:   {total_kept}")
    print(f"Duplicates Moved:    {total_moved}")

# Entry point: run the duplicate scan only when this code is executed
# directly (its module name is "__main__"), not when it is imported.
if __name__ == "__main__":
    find_and_move_duplicates()