In [None]:
# Attach Google Drive to the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [1]:
# @title
import os
import glob
import shutil
import uuid
import json
import numpy as np
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# -------------------------------------------------------------------
# --- SETTINGS ---
# --- Please edit these variables before running ---
# --- Everything below this section reads these names; the script
# --- itself may flip the two force_* flags to True while running.
# -------------------------------------------------------------------

# 1. Root folder to scan. It is searched recursively for "*.md" files,
#    and the script raises FileNotFoundError if it is not a directory.
search_directory = "/content/drive/MyDrive/---- playground -----/Documents/converted_from_docx_to_md"

# 2. Folder holding the resume caches: file_paths.json, file_contents.json
#    and similarity_matrix.npy. It is created on demand when the file
#    caches are saved.
cache_directory = "/content/drive/MyDrive//.md_analyzer_cache"

# 3. Force rescan: True ignores the cached file list/contents and re-reads
#    every file. The script also sets this to True when no usable cache exists.
force_rescan = False

# 4. Force recalculate: True ignores the cached similarity matrix. The script
#    also sets this to True when the cached matrix is missing, unreadable, or
#    its row count does not match the number of files.
force_recalculate = False

# 5. Report threshold: every file pair whose similarity is >= this value is
#    printed in the Step 5 report (0.8 = 80%).
similarity_threshold = 0.8

# 6. Set to True to run the (optional) organization step, which MOVES files
#    out of the scanned folder.
run_organization = False

# 7. Move threshold: only used when run_organization=True. For each pair with
#    similarity >= this value (0.95 = 95%), the second file of the pair is moved.
move_threshold = 0.95

# 8. Duplicates folder: only used when run_organization=True. Created if it
#    does not exist. The organization step aborts if this is the same as, or
#    appears to be inside, search_directory.
duplicates_folder = "/content/drive/MyDrive/Duplicates_To_Review"

# -------------------------------------------------------------------
# --- SCRIPT START ---
# -------------------------------------------------------------------
print("--- Step 1: Mounting Google Drive ---")
try:
    drive.mount('/content/drive')
except Exception as mount_error:
    # Every later step reads from the Drive, so report the problem and
    # let the original exception stop the cell.
    print(f"Error mounting drive: {mount_error}")
    raise

# Echo the active configuration so the run log records what was used.
print("\n--- Step 2: Checking Settings ---")
print(
    f"Looking for .md files in: {search_directory}",
    f"Cache will be stored in: {cache_directory}",
    f"Report threshold set to: {similarity_threshold * 100}%",
    sep="\n",
)
if not run_organization:
    print("Organization IS NOT enabled.")
else:
    print("Organization IS enabled.")
    print(
        f"Move threshold set to: {move_threshold * 100}%",
        f"Duplicates will be moved to: {duplicates_folder}",
        sep="\n",
    )


# --- Step 3: Find, Read, & Cache Markdown Files ---
print("\n--- Step 3: Finding and Reading Files ---")

# The list of readable paths and their contents are cached side by side
# as two JSON files inside the cache directory.
file_list_cache, file_contents_cache = (
    os.path.join(cache_directory, cache_name)
    for cache_name in ("file_paths.json", "file_contents.json")
)

def find_markdown_files(directory):
    """Return every ``.md`` file below ``directory``, searched recursively.

    Args:
        directory: Root folder to search. It is treated as a literal path:
            glob metacharacters in the folder name (``*``, ``?``, ``[``) are
            escaped, so a folder such as ``notes [v1]`` is searched correctly
            instead of being read as a pattern.

    Returns:
        A sorted list of matching file paths (empty if nothing matches).
        Sorting keeps the order stable between runs, so indices stored in
        caches refer to the same files. Directories whose name ends in
        ``.md`` are excluded. Dot-prefixed files and folders are skipped,
        which is ``glob``'s default behaviour.
    """
    pattern = os.path.join(glob.escape(directory), "**", "*.md")
    return sorted(
        path
        for path in glob.glob(pattern, recursive=True)
        if os.path.isfile(path)
    )

# --- Caching Logic (Resumability) ---
# Reuse the cached file list/contents when both cache files exist and the
# user has not asked for a rescan. Any problem with the cache falls back to
# a fresh scan by setting force_rescan = True.
if not force_rescan and os.path.exists(file_list_cache) and os.path.exists(file_contents_cache):
    print(f"Loading file lists from cache: {cache_directory}")
    try:
        with open(file_list_cache, 'r', encoding='utf-8') as f:
            file_paths = json.load(f)
        with open(file_contents_cache, 'r', encoding='utf-8') as f:
            file_contents = json.load(f)

        # The two cache files are written one after the other, so they can
        # disagree (e.g. the second write failed). Index i of one must
        # describe index i of the other, so reject anything inconsistent.
        if not isinstance(file_paths, list) or not isinstance(file_contents, list):
            raise ValueError("cache files do not contain JSON lists")
        if len(file_paths) != len(file_contents):
            raise ValueError(
                f"cache is inconsistent ({len(file_paths)} paths vs {len(file_contents)} contents)"
            )

        # A cache built for another search_directory must not be reused
        # silently after the setting was changed.
        scan_root = os.path.abspath(search_directory)
        for cached_path in file_paths:
            if os.path.commonpath([scan_root, os.path.abspath(cached_path)]) != scan_root:
                raise ValueError(f"cache was built for a different folder (found '{cached_path}')")

        print(f"Successfully loaded {len(file_paths)} files from cache.")
    except Exception as e:
        print(f"Error loading cache: {e}. Rescanning...")
        force_rescan = True
else:
    print("No valid cache found or rescan forced. Starting new file scan...")
    force_rescan = True # Ensure we run the block below

if force_rescan:
    # --- Error Handling: Check if directory exists ---
    if not os.path.isdir(search_directory):
        print(f"\n--- ERROR: Directory not found ---")
        print(f"The path '{search_directory}' does not exist or is not a directory.")
        print("Please check your 'search_directory' path at the top of the script and try again.")
        raise FileNotFoundError(f"Directory not found: {search_directory}")

    print(f"Starting file search in: {search_directory}")
    all_file_paths = find_markdown_files(search_directory)
    print(f"Found {len(all_file_paths)} markdown files.")

    # file_paths and file_contents are kept index-aligned: only files that
    # were read successfully appear in either list.
    file_contents = []
    file_paths = []

    if not all_file_paths:
        print("\nWarning: No .md files found.")
    else:
        print("\nReading file contents...")
        for i, f_path in enumerate(all_file_paths):
            try:
                with open(f_path, 'r', encoding='utf-8') as f:
                    text = f.read()
            except Exception as e:
                # Best effort: an unreadable file (e.g. not valid UTF-8) is
                # reported and skipped rather than aborting the whole scan.
                print(f"  Could not read {f_path}: {e}")
            else:
                file_contents.append(text)
                file_paths.append(f_path)

            # --- Heartbeat Status Update ---
            # Placed after the try/except so progress is reported even when
            # the file at a reporting milestone could not be read.
            if (i + 1) % 100 == 0 or (i + 1) == len(all_file_paths):
                print(f"  ...Processed {i + 1}/{len(all_file_paths)} files")

        print(f"\nSuccessfully read {len(file_contents)} files.")

        # --- Caching Logic: Save results ---
        try:
            os.makedirs(cache_directory, exist_ok=True)
            print(f"Saving file list and contents to cache: {cache_directory}")
            with open(file_list_cache, 'w', encoding='utf-8') as f:
                json.dump(file_paths, f)
            with open(file_contents_cache, 'w', encoding='utf-8') as f:
                json.dump(file_contents, f)
            print("Cache save complete.")
        except Exception as e:
            print(f"--- Warning: Could not save cache ---")
            print(f"Error: {e}")
            print("This will not stop the script, but progress will not be resumable.")


# --- Step 4: Calculate Similarity (with Caching) ---
# Builds (or loads from cache) the pairwise cosine-similarity matrix of the
# TF-IDF vectors of file_contents. Row/column i corresponds to file_paths[i].
print("\n--- Step 4: Calculating Similarity ---")

similarity_matrix = []
matrix_cache_path = os.path.join(cache_directory, "similarity_matrix.npy")

# --- Graceful Exiting ---
if 'file_contents' not in locals() or len(file_contents) < 2:
    print("Not enough files to compare (need at least 2). Stopping.")
    print("Please run Step 3 successfully first.")
else:
    # A matrix cached from an earlier scan describes the OLD file contents.
    # If Step 3 re-read the files, the cached matrix can be stale even when
    # the file count is unchanged, so it must not be reused.
    if force_rescan and not force_recalculate:
        print("Files were re-read in Step 3, so any cached matrix is stale. Forcing recalculation...")
        force_recalculate = True

    # --- Caching Logic (Resumability) ---
    if not force_recalculate and os.path.exists(matrix_cache_path):
        try:
            print(f"Loading similarity matrix from cache: {matrix_cache_path}")
            similarity_matrix = np.load(matrix_cache_path)
            print("Successfully loaded matrix.")
            # Sanity check: the matrix must be square with one row and one
            # column per file, otherwise later indexing would be wrong.
            expected_shape = (len(file_paths), len(file_paths))
            if similarity_matrix.shape != expected_shape:
                print(f"Cache-File mismatch! (Matrix: {similarity_matrix.shape}, Files: {len(file_paths)})")
                print("Forcing recalculation...")
                force_recalculate = True
            else:
                print("\nCalculation complete (loaded from cache).")
        except Exception as e:
            print(f"Error loading matrix cache: {e}. Recalculating...")
            force_recalculate = True
    else:
        print("No valid matrix cache found or recalculation forced.")
        force_recalculate = True # Ensure we run the block below

    if force_recalculate:
        # --- Heartbeat Status Update ---
        print("Calculating TF-IDF matrix... (This may take a while for many files)")
        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(file_contents)

        # --- Heartbeat Status Update ---
        print("Calculating cosine similarity matrix...")
        similarity_matrix = cosine_similarity(tfidf_matrix)

        print("\nCalculation complete.")

        # --- Caching Logic: Save results ---
        # Best effort: a failed save only costs resumability.
        try:
            # The directory may not exist yet if the Step 3 cache save failed.
            os.makedirs(cache_directory, exist_ok=True)
            print(f"Saving similarity matrix to cache: {matrix_cache_path}")
            np.save(matrix_cache_path, similarity_matrix)
            print("Cache save complete.")
        except Exception as e:
            print(f"--- Warning: Could not save matrix cache ---")
            print(f"Error: {e}")


# --- Step 5: Generate Similarity Report ---
print(f"\n--- Step 5: Similarity Report (Threshold: {similarity_threshold * 100}%) ---")


def _report_similar_pairs(matrix, paths, threshold):
    """Print and collect every file pair whose similarity reaches ``threshold``.

    Only the upper triangle of ``matrix`` is visited, so each unordered pair
    is considered once and a file is never compared with itself.

    Returns:
        A list of ``(path_a, path_b, similarity)`` tuples in row-major order.
    """
    matches = []
    size = len(matrix)
    for row in range(size):
        for col in range(row + 1, size):
            score = matrix[row][col]
            if not score >= threshold:
                continue
            print("\n[!] SIMILAR PAIR FOUND:")
            print(f"  File A: {paths[row]}")
            print(f"  File B: {paths[col]}")
            print(f"  Similarity: {score * 100:.2f}%")
            matches.append((paths[row], paths[col], score))
    return matches


similar_pairs = []

# --- Graceful Exiting ---
if 'similarity_matrix' in locals() and len(similarity_matrix) != 0:
    similar_pairs = _report_similar_pairs(similarity_matrix, file_paths, similarity_threshold)
    if not similar_pairs:
        print("\nNo files found above the similarity threshold.")
else:
    print("Similarity matrix not calculated. Please run previous steps first.")


# --- Step 6: Organize Similar Files (Optional) ---
# When run_organization is True, moves the second file of every pair whose
# similarity is >= move_threshold into duplicates_folder. Nothing is deleted.
print(f"\n--- Step 6: Organizing Similar Files ---")

if run_organization:
    # --- Graceful Exiting ---
    if 'similarity_matrix' not in locals() or len(similarity_matrix) == 0:
        print("Error: Similarity matrix not calculated. Please run Step 4 first.")
    else:
        # --- CRITICAL SAFETY CHECKS (Error Handling) ---
        search_dir_abs = os.path.abspath(search_directory)
        dupes_dir_abs = os.path.abspath(duplicates_folder)

        # Compare whole path components, not string prefixes: with a plain
        # startswith() a sibling such as ".../docs_dupes" would be wrongly
        # treated as being inside ".../docs".
        try:
            dupes_inside_search = (
                os.path.commonpath([search_dir_abs, dupes_dir_abs]) == search_dir_abs
            )
        except ValueError:
            # Raised when the paths share no common root (e.g. different
            # drives), in which case one cannot contain the other.
            dupes_inside_search = False

        if search_dir_abs == dupes_dir_abs:
            print(f"\n--- FATAL ERROR: DANGEROUS PATH ---")
            print("Your 'duplicates_folder' is the SAME as your 'search_directory'.")
            print("Running this would delete files from the folder you are scanning.")
            print("Operation ABORTED. Please set a different 'duplicates_folder'.")
        elif dupes_inside_search:
            print(f"\n--- FATAL ERROR: DANGEROUS PATH ---")
            print("Your 'duplicates_folder' is INSIDE your 'search_directory'.")
            print("This could cause major problems with future scans.")
            print("Operation ABORTED. Please set a different 'duplicates_folder' (outside of the search area).")
        else:
            # --- Path is safe, proceed with moving files ---
            if not os.path.exists(duplicates_folder):
                os.makedirs(duplicates_folder, exist_ok=True)
                print(f"Created directory: {duplicates_folder}")

            moved_files = set() # To avoid moving a file that appears in multiple pairs
            files_to_move = []

            # Find files to move based on the *new* threshold
            for i in range(len(similarity_matrix)):
                for j in range(i + 1, len(similarity_matrix)):
                    if similarity_matrix[i][j] >= move_threshold:
                        # Move the "B" file of the pair, unless either file of
                        # the pair is already scheduled to be moved. This keeps
                        # at least the "A" file of each pair in place.
                        if file_paths[j] not in moved_files and file_paths[i] not in moved_files:
                            files_to_move.append(file_paths[j])
                            moved_files.add(file_paths[j]) # Mark it as 'to be moved'

            if not files_to_move:
                print(f"No files met the high threshold ({move_threshold * 100}%) for moving.")
            else:
                print(f"Moving {len(files_to_move)} files to {duplicates_folder}...")
                moved_count = 0
                for file_path in files_to_move:
                    # --- Error Handling for file operations ---
                    try:
                        base_name = os.path.basename(file_path)
                        dest_path = os.path.join(duplicates_folder, base_name)

                        # Never overwrite an existing file in the duplicates
                        # folder: add a short random suffix to the name instead.
                        if os.path.exists(dest_path):
                            base, ext = os.path.splitext(base_name)
                            dest_path = os.path.join(duplicates_folder, f"{base}_{uuid.uuid4().hex[:6]}{ext}")

                        shutil.move(file_path, dest_path)
                        moved_count += 1
                        print(f"  MOVED: {file_path} \n    TO: {dest_path}")
                    except Exception as e:
                        print(f"  ERROR moving {file_path}: {e}")
                print("\nOrganization complete.")
                if moved_count:
                    # The cached file list and matrix still describe the
                    # files at their old locations.
                    print("Note: cached results still reference the moved files. "
                          "Set force_rescan = True for the next run.")
else:
    print("Organization script was not run. (run_organization is False)")

print("\n--- SCRIPT FINISHED ---")



--- Step 1: Mounting Google Drive ---
Mounted at /content/drive

--- Step 2: Checking Settings ---
Looking for .md files in: /content/drive/MyDrive/---- playground -----/Documents/converted_from_docx_to_md
Cache will be stored in: /content/drive/MyDrive//.md_analyzer_cache
Report threshold set to: 80.0%
Organization IS NOT enabled.

--- Step 3: Finding and Reading Files ---
Loading file lists from cache: /content/drive/MyDrive//.md_analyzer_cache
Successfully loaded 313 files from cache.

--- Step 4: Calculating Similarity ---
Loading similarity matrix from cache: /content/drive/MyDrive//.md_analyzer_cache/similarity_matrix.npy
Successfully loaded matrix.

Calculation complete (loaded from cache).

--- Step 5: Similarity Report (Threshold: 80.0%) ---

[!] SIMILAR PAIR FOUND:
  File A: /content/drive/MyDrive/---- playground -----/Documents/converted_from_docx_to_md/dave_matthews_deep_research_prompt.md
  File B: /content/drive/MyDrive/---- playground -----/Documents/converted_from_docx_