Scans for duplicate files.

In [None]:
# @title  Find and Move Duplicates --- IGNORE --- keeps the oldest version and moves the rest to a 'dupeshit' folder
import os
import hashlib
import shutil
from collections import defaultdict

def get_file_hash(filepath):
    """Return the SHA-256 hex digest of the file at ``filepath``.

    The file is consumed in 4096-byte chunks. If opening or reading it
    fails for any reason, the error is printed and ``None`` is returned.
    """
    digest = hashlib.sha256()
    try:
        with open(filepath, "rb") as handle:
            while True:
                chunk = handle.read(4096)
                if not chunk:
                    # An empty read marks end-of-file.
                    break
                digest.update(chunk)
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return None
    return digest.hexdigest()

def find_and_move_duplicates(start_dir=None):
    """
    Find duplicate files under a directory tree and move the extras away.

    Files are first grouped by size (cheap pre-filter) and then by SHA-256
    content hash. Within each group of identical files the one with the
    oldest modification time is kept in place; every other copy is moved
    into a 'dupeshit' folder directly inside ``start_dir``. Empty files are
    ignored. Progress and a summary are printed; nothing is returned.

    Args:
        start_dir: Root of the tree to scan. Defaults to the current
            working directory.
    """
    if start_dir is None:
        start_dir = os.getcwd()
    dupes_dir = os.path.join(start_dir, "dupeshit")

    # --- 1. Setup ---
    try:
        os.makedirs(dupes_dir, exist_ok=True)
        print(f"Duplicates will be moved to: {dupes_dir}\n")
    except OSError as e:
        print(f"Error creating directory {dupes_dir}: {e}")
        return

    # --- 2. Group files by size (fast pre-filter) ---
    size_groups = defaultdict(list)
    # Resolved paths already recorded. Without this, a symlink and its
    # target resolve to the same path, look like "duplicates" of each
    # other, and the only real copy would be moved away.
    seen_paths = set()
    real_dupes_prefix = os.path.realpath(dupes_dir) + os.sep
    print(f"Scanning files in {start_dir}...")

    for dirpath, dirnames, filenames in os.walk(start_dir):
        # Prune the 'dupeshit' folder itself (and only it: a plain prefix
        # test would also skip siblings such as 'dupeshit2').
        dirnames[:] = [
            d for d in dirnames if os.path.join(dirpath, d) != dupes_dir
        ]

        for filename in filenames:
            # Use os.path.realpath to follow symlinks
            real_path = os.path.realpath(os.path.join(dirpath, filename))
            if real_path in seen_paths:
                continue
            if real_path.startswith(real_dupes_prefix):
                continue  # Link pointing at an already-moved duplicate
            seen_paths.add(real_path)

            try:
                if not os.path.isfile(real_path):
                    continue
                file_size = os.path.getsize(real_path)
            except OSError:
                continue  # Broken link or unreadable entry

            if file_size > 0:  # Ignore empty files
                size_groups[file_size].append(real_path)

    # --- 3. Group by content hash (slower, accurate check) ---
    hash_groups = defaultdict(list)
    total_files_checked = 0
    total_kept = 0

    print("Checking for duplicate content...")
    for files in size_groups.values():
        total_files_checked += len(files)
        if len(files) < 2:
            # A file with a unique size cannot have a duplicate: it is kept.
            total_kept += 1
            continue

        for filepath in files:
            file_hash = get_file_hash(filepath)
            if file_hash:
                hash_groups[file_hash].append(filepath)

    # --- 4. Find oldest and move the rest ---
    total_moved = 0

    print("\n--- Processing Duplicates ---")

    for file_hash, files in hash_groups.items():
        if len(files) < 2:
            total_kept += 1
            continue  # Same size as another file, but unique content

        # True duplicates: collect (modification_time, filepath) pairs.
        file_times = []
        for filepath in files:
            try:
                file_times.append((os.path.getmtime(filepath), filepath))
            except OSError:
                continue  # Vanished since the scan

        if not file_times:
            continue  # Every copy disappeared; nothing to keep or move

        # Oldest first; the sort is stable, so ties keep scan order.
        file_times.sort(key=lambda x: x[0])

        file_to_keep = file_times[0][1]
        print(f"\nGroup (Hash: {file_hash[:10]}...):")
        print(f"  Keeping (Oldest): {file_to_keep}")
        total_kept += 1

        for _mtime, filepath in file_times[1:]:
            try:
                filename = os.path.basename(filepath)
                name, ext = os.path.splitext(filename)
                dest_path = os.path.join(dupes_dir, filename)

                # Handle name conflicts in the 'dupeshit' folder
                counter = 1
                while os.path.exists(dest_path):
                    dest_path = os.path.join(dupes_dir, f"{name}_{counter}{ext}")
                    counter += 1

                shutil.move(filepath, dest_path)
                print(f"  Moved:            {filepath}")
                total_moved += 1
            except Exception as e:
                print(f"  Error moving {filepath}: {e}")

    print("\n--- Summary ---")
    print(f"Total Files Scanned: {total_files_checked}")
    print(f"Unique Files Kept:   {total_kept}")
    print(f"Duplicates Moved:    {total_moved}")

# Run the duplicate scan only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    find_and_move_duplicates()

In [None]:
# @title Then run this to list file sizes (ls -ls) and store the listing in nnnn.txt
!ls -ls > nnnn.txt

In [None]:
# @title Sort the list by the file count (the first item in our tuple)



import os

search_path = "/content/drive/MyDrive/main_shit"

# Walk the whole tree once; every directory visited contributes one
# (number_of_files, directory_path) tuple. Subdirectories are not counted
# as files of their parent.
dir_file_counts = [
    (len(files_here), folder)
    for folder, _subdirs, files_here in os.walk(search_path, topdown=True)
]

# Ascending by file count (first tuple item); ties keep walk order.
dir_file_counts.sort(key=lambda entry: entry[0])

# Report one "<count> <path>" line per directory.
print("--- File Counts per Directory ---")
for count, path in dir_file_counts:
    print(f"{count} {path}")

Scan for filenames carrying a ' (1)', ' (2)', … copy suffix.


In [None]:
# @title
import re
import os
import shutil

def move_duplicates_by_filename(file_list_path, dest_folder="mddupes"):
    """
    Move every listed file whose name ends in a ' (n)' copy suffix.

    Reads a saved ``ls -ls`` (or ``ls -l``) listing, extracts the filename
    from each line and moves files such as ``photo (1).jpg`` or
    ``archive (2).tar.gz`` into ``dest_folder``. Only the filename pattern
    is inspected; file contents are never compared. Source paths are
    resolved relative to the current working directory.

    Args:
        file_list_path: Path of the text file holding the listing.
        dest_folder: Folder that receives the matching files; created if
            it does not exist.

    Returns:
        None. Progress and a summary are printed.
    """

    # --- 1. Setup ---
    try:
        os.makedirs(dest_folder, exist_ok=True)
        print(f"Using destination folder: '{dest_folder}/'\n")
    except OSError as e:
        print(f"Error creating directory {dest_folder}: {e}")
        return

    # ' (n)' must be a true suffix: it may only be followed by file
    # extensions, so 'Report (2023) final.pdf' is left alone while
    # 'a (1).txt' and 'a (1).tar.gz' match.
    suffix_regex = re.compile(r'\s\(\d+\)(?:\.[^.\s]+)*$')

    # The destination folder appears in the listing under its own name.
    dest_name = os.path.basename(os.path.normpath(dest_folder))

    total_moved = 0
    total_skipped = 0
    total_errors = 0

    print("--- Scanning for files with ' (n)' suffixes to move ---")

    # --- 2. Read file list and move files ---
    try:
        with open(file_list_path, 'r') as f:
            for raw_line in f:
                line = raw_line.strip()
                parts = line.split()
                if len(parts) < 2:  # Skip blank lines
                    continue

                # 'ls -ls' prepends a numeric block-count column, which
                # shifts the name from field 8 ('ls -l') to field 9.
                # NOTE(review): assumes the default three-field date
                # (month, day, time/year) — other --time-style values
                # would shift the name column.
                name_index = 9 if parts[0].isdigit() else 8

                # Limit the number of splits so spaces inside the filename
                # are preserved exactly.
                fields = line.split(None, name_index)
                if len(fields) <= name_index:  # e.g. the 'total N' line
                    continue
                filename = fields[name_index]

                if not filename or filename == dest_name:
                    continue

                if not suffix_regex.search(filename):
                    # This is an original file, we skip it
                    total_skipped += 1
                    continue

                # This is a file to move
                try:
                    source_path = filename
                    if not os.path.exists(source_path):
                        print(f"Skipped: {filename} (File not found)")
                        total_skipped += 1
                        continue

                    # Never overwrite something already in the destination.
                    stem, ext = os.path.splitext(filename)
                    dest_path = os.path.join(dest_folder, filename)
                    counter = 1
                    while os.path.exists(dest_path):
                        dest_path = os.path.join(
                            dest_folder, f"{stem}_{counter}{ext}"
                        )
                        counter += 1

                    shutil.move(source_path, dest_path)
                    print(f"Moved:   {filename}")
                    total_moved += 1
                except OSError as e:
                    print(f"Error moving {filename}: {e}")
                    total_errors += 1

    except FileNotFoundError:
        print(f"Error: The file list '{file_list_path}' was not found.")
        return
    except (OSError, UnicodeError) as e:
        print(f"An error occurred reading the list file: {e}")
        return

    print("\n--- Summary ---")
    print(f"Files Moved:   {total_moved}")
    print(f"Files Skipped: {total_skipped} (Originals or not found)")
    print(f"Errors:        {total_errors}")

if __name__ == "__main__":
    # Process the listing stored in 'nnnn.txt', looked up relative to the
    # current working directory.
    move_duplicates_by_filename('nnnn.txt')

Reads an 'ls -ls' list, confirms true duplicates by hash, keeps the first file (alphabetically), and moves the rest to the destination folder.

In [None]:
# @title
import re
from collections import defaultdict
import os
import hashlib
import shutil

def get_file_hash(filepath):
    """Return the SHA-256 hex digest of ``filepath``, or ``None`` on failure.

    The file is read in 4096-byte chunks. A missing file yields ``None``
    silently (it may have been moved by a previous step); any other error
    is printed before ``None`` is returned.
    """
    hasher = hashlib.sha256()
    try:
        with open(filepath, "rb") as stream:
            chunk = stream.read(4096)
            while chunk:
                hasher.update(chunk)
                chunk = stream.read(4096)
    except FileNotFoundError:
        # File might have been moved by a previous step
        return None
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return None
    return hasher.hexdigest()

def verify_and_move_duplicates(file_list_path, dest_folder="mddupes"):
    """
    Confirm duplicates from an 'ls -ls' listing by hash and move the extras.

    Regular files named in the listing are grouped by the byte size shown
    there; groups with more than one member are hashed with
    ``get_file_hash``. For each set of identical files the alphabetically
    first name is kept in place and the others are moved to
    ``dest_folder``. Zero-byte files are reported and left alone. File
    names are resolved relative to the current working directory.

    Args:
        file_list_path: Path of the text file holding ``ls -ls`` output.
        dest_folder: Folder that receives the duplicates; created if it
            does not exist.

    Returns:
        None. Progress and a summary are printed.
    """

    # --- 1. Setup ---
    try:
        os.makedirs(dest_folder, exist_ok=True)
        print(f"Using destination folder: '{dest_folder}/'\n")
    except OSError as e:
        print(f"Error creating directory {dest_folder}: {e}")
        return

    # Regex to parse 'ls -ls' output:
    #   blocks perms links owner group size <3 date tokens> name
    # Every column except the name is a run of non-space characters, so
    # the name (which may contain spaces) is everything after the single
    # space that follows the third date token. A character class that
    # allows whitespace for the date would swallow the leading words of
    # names such as 'a (1).txt'.
    line_regex = re.compile(
        r'^\s*(\d+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)'
        r'\s+(\S+\s+\S+\s+\S+)\s(.*)$'
    )

    size_groups = defaultdict(list)
    zero_byte_files = []

    total_moved = 0
    total_kept = 0
    total_errors = 0

    # --- 2. Parse the file list and group by size ---
    try:
        with open(file_list_path, 'r') as f:
            for raw_line in f:
                # Drop only the line terminator so names keep their spaces.
                line = raw_line.rstrip('\r\n')
                if not line.strip() or line.lstrip().startswith('total'):
                    continue

                match = line_regex.match(line)
                if not match:
                    continue  # Ignore malformed lines

                permissions = match.group(2)
                byte_size = int(match.group(6))
                filename = match.group(8)

                if not permissions.startswith('-'):
                    continue  # Skip directories, symlinks, devices, ...
                if byte_size == 0:
                    zero_byte_files.append(filename)
                else:
                    size_groups[byte_size].append(filename)

    except FileNotFoundError:
        print(f"Error: The file '{file_list_path}' was not found.")
        return

    print("--- Verifying file content and moving duplicates ---")

    # --- 3. Verify by hash and move duplicates ---
    for byte_size, filenames in sorted(size_groups.items(), key=lambda item: item[0]):
        # We only need to hash groups with potential dupes
        if len(filenames) < 2:
            total_kept += 1
            continue

        # Group files in this size-group by their content hash
        hash_groups = defaultdict(list)
        for name in filenames:
            file_hash = get_file_hash(name)
            if file_hash:
                hash_groups[file_hash].append(name)

        for h, files in hash_groups.items():
            if len(files) == 1:
                # False positive: same size, different content
                total_kept += 1
                continue

            # TRUE duplicate group: the alphabetically first name stays.
            # NOTE(review): ' ' sorts before '.', so 'a (1).txt' is kept
            # over 'a.txt' — confirm this is the intended "original".
            files.sort()
            file_to_keep = files[0]

            print(f"\n--- Group (Hash: {h[:10]}...) ---")
            print(f"  Keeping: {file_to_keep}")
            total_kept += 1

            for dupe in files[1:]:
                try:
                    # Check if file still exists before moving
                    if not os.path.exists(dupe):
                        print(f"  Skipped: {dupe} (already moved or missing)")
                        continue

                    # Never overwrite something already in the destination.
                    stem, ext = os.path.splitext(dupe)
                    dest_path = os.path.join(dest_folder, dupe)
                    counter = 1
                    while os.path.exists(dest_path):
                        dest_path = os.path.join(
                            dest_folder, f"{stem}_{counter}{ext}"
                        )
                        counter += 1

                    shutil.move(dupe, dest_path)
                    print(f"  Moved:   {dupe}")
                    total_moved += 1
                except OSError as e:
                    print(f"  Error moving {dupe}: {e}")
                    total_errors += 1

    # --- 4. Report on Zero-Byte Files ---
    if zero_byte_files:
        print("\n--- Zero-Byte Files (Ignored) ---")
        for f in sorted(zero_byte_files):
            print(f"  - {f}")
        total_kept += len(zero_byte_files)

    print("\n--- Summary ---")
    print(f"Files Moved:   {total_moved}")
    print(f"Files Kept:    {total_kept}")
    print(f"Errors:        {total_errors}")

if __name__ == "__main__":
    # This script reads the 'ls -ls' output, groups by size,
    # calculates a hash to confirm true duplicates,
    # keeps one file, and moves the others.
    # NOTE(review): the listing read here is 'sizeaftersnake.txt', not the
    # 'nnnn.txt' used by the other cells — confirm where it is generated.
    verify_and_move_duplicates('sizeaftersnake.txt')