## SCRIPT A: (Upgraded) FILE-to-TEXT CONVERTER

This script is NOW UPGRADED to read `.md` files.

It will:
1. Install system tools (antiword, poppler-utils) and Python libraries (PyMuPDF, python-docx, tqdm).
2. Scan your source folder (including subfolders) for `.pdf`, `.txt`, `.doc`, `.docx`, and `.md` files.
3. Create a pool of worker processes to convert all files in parallel.
4. Save a log of all unsupported/errored files.

In [3]:
# @title SCRIPT A: (Upgraded) FILE-to-TEXT CONVERTER
#
# This script is NOW UPGRADED to read .md files.
#
# It will:
# 1. Install system tools (antiword, poppler-utils) and Python libraries (PyMuPDF, python-docx, tqdm).
# 2. Scan your source folder (including subfolders) for .pdf, .txt, .doc, .docx, and .md files.
# 3. Create a pool of worker processes to convert all files in parallel.
# 4. Save a log of all unsupported/errored files.

# Step 0: install system packages and Python libraries used by this cell.
# Lines starting with "!" are IPython/Colab shell escapes, so this cell must
# be run inside a notebook, not as a plain Python script.
print("--- Step 0: Installing Text Extraction Libraries ---")
# antiword is the external tool process_file() runs to read legacy .doc files.
# NOTE(review): poppler-utils is installed here but nothing visible in this
# cell calls it -- confirm it is still needed.
!apt-get install -y antiword poppler-utils
!pip install PyMuPDF python-docx tqdm -q
# NOTE: this message is printed unconditionally; it does not check whether the
# two install commands above actually succeeded.
print("Libraries (PyMuPDF, python-docx, tqdm) installed.")

import os
import time
import fitz  # PyMuPDF
import docx # python-docx
import subprocess # For running antiword
from google.colab import drive
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm.auto import tqdm # Progress bar

# ------------------------------------------------------------------
#  Processor Function (This runs on separate CPU cores)
# ------------------------------------------------------------------

def process_file(source_path, target_path, file_ext_lower):
    """
    Extract the text of a single file and save it as UTF-8 at ``target_path``.

    This function is designed to run in a separate worker process. Ordinary
    exceptions are caught and reported through the return value instead of
    being raised.

    Args:
        source_path: Path of the file to read.
        target_path: Path of the .txt file to create.
        file_ext_lower: Lower-cased extension of the source file, including
            the dot. Handled values: '.pdf', '.docx', '.doc', '.txt', '.md'.

    Returns:
        (source_path, "SUCCESS") or (source_path, "ERROR: message")
    """
    # The text is written here first and renamed only once it is complete, so
    # a failed write never leaves a half-written target_path behind (callers
    # that skip existing targets would otherwise treat it as finished).
    partial_path = f"{target_path}.part"

    try:
        if file_ext_lower == '.pdf':
            with fitz.open(source_path) as doc:
                full_text = "".join(page.get_text() for page in doc)

        elif file_ext_lower == '.docx':
            # Only the document's `paragraphs` collection is read.
            doc = docx.Document(source_path)
            full_text = "".join(f"{para.text}\n" for para in doc.paragraphs)

        elif file_ext_lower == '.doc':
            # Legacy Word files are converted by the external `antiword` tool.
            result = subprocess.run(
                ['antiword', source_path],
                capture_output=True,
                text=True,
                encoding='utf-8',
                errors='ignore'
            )
            if result.returncode != 0:
                raise RuntimeError(f"Antiword error: {result.stderr}")
            full_text = result.stdout

        elif file_ext_lower in ('.txt', '.md'):
            # Markdown is treated as plain text.
            with open(source_path, 'r', encoding='utf-8', errors='ignore') as f:
                full_text = f.read()

        else:
            # Previously an unknown extension silently produced an empty
            # output file and was reported as a success.
            raise ValueError(f"Unsupported file extension: {file_ext_lower!r}")

        # Write the extracted text, then publish it under its final name.
        with open(partial_path, 'w', encoding='utf-8') as f_out:
            f_out.write(full_text)
        os.replace(partial_path, target_path)

        return (source_path, "SUCCESS")

    except Exception as e:
        # Best-effort cleanup of an incomplete output; the file usually does
        # not exist, and a cleanup failure must not mask the original error.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return (source_path, f"ERROR: {e}")

# ------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------
print("\n--- Configuring Paths ---")

# Mount Google Drive first: every path configured below lives on it.
try:
    drive.mount('/content/drive', force_remount=True)
except Exception as mount_error:
    print(f"Error mounting drive: {mount_error}")
    raise
else:
    print("Google Drive mounted.")

# --- !!! CHANGE THIS !!! ---
# Folder holding the input documents (it is walked recursively).
SOURCE_DIR = '/content/drive/MyDrive/main_shit'

# Root folder that receives every output of this project.
MAIN_OUTPUT_DIRECTORY = '/content/drive/MyDrive/Main_Shit_Output'
# -------------------------

# Derived locations: converted text, script state, and the skipped-file log.
TEXT_OUTPUT_DIR = os.path.join(MAIN_OUTPUT_DIRECTORY, 'Text_Files')
STATE_DIRECTORY = os.path.join(MAIN_OUTPUT_DIRECTORY, 'Script_State')
UNSUPPORTED_LOG_FILE = os.path.join(STATE_DIRECTORY, 'unsupported_files.log')

for required_dir in (TEXT_OUTPUT_DIR, STATE_DIRECTORY):
    os.makedirs(required_dir, exist_ok=True)

print(f"Reading all files from: {SOURCE_DIR}")
print(f"Saving new .txt files to: {TEXT_OUTPUT_DIR}")
print(f"Saving unsupported file log to: {UNSUPPORTED_LOG_FILE}")

# Lower-case extensions the converter accepts; '.md' is handled as plain text.
SUPPORTED_EXTENSIONS = ('.pdf', '.txt', '.doc', '.docx', '.md')

# ------------------------------------------------------------------
# Step 1: Build Job List
# ------------------------------------------------------------------
print("\n--- Scanning file system to build job list ---")
start_time = time.time()
jobs_to_process = []          # (source path, target .txt path, extension) tuples
unsupported_file_paths = []   # paths written to the unsupported-file log
total_scanned = skipped_exist = unsupported_skipped = 0

for folder, _subfolders, names in os.walk(SOURCE_DIR):
    # Location of this folder's mirror underneath the text output tree.
    mirrored_dir = os.path.join(TEXT_OUTPUT_DIR, os.path.relpath(folder, SOURCE_DIR))

    for name in names:
        total_scanned += 1
        source_file_path = os.path.join(folder, name)
        extension = os.path.splitext(name.lower())[1]

        # Anything we cannot convert is counted, and logged unless it is a .gdoc.
        if extension not in SUPPORTED_EXTENSIONS:
            unsupported_skipped += 1
            if not source_file_path.endswith('.gdoc'):
                unsupported_file_paths.append(source_file_path)
            continue

        # The mirror folder is only created once a convertible file is found.
        os.makedirs(mirrored_dir, exist_ok=True)

        # The output keeps the full original name and appends ".txt".
        target_txt_path = os.path.join(mirrored_dir, f"{name}.txt")

        # Resumability: an existing output means the file was already converted.
        if os.path.exists(target_txt_path):
            skipped_exist += 1
        else:
            jobs_to_process.append((source_file_path, target_txt_path, extension))

print(f"Scan complete in {time.time() - start_time:.2f} seconds.")
print(f"Total files found: {total_scanned}")
print(f"Skipped (already exist): {skipped_exist}")
print(f"Skipped (unsupported, e.g., .gdoc, .py): {unsupported_skipped}")
print(f"New files to convert: {len(jobs_to_process)}")

# ------------------------------------------------------------------
# Step 2: Run Jobs in Parallel
# ------------------------------------------------------------------
print(f"\n--- Starting parallel conversion of {len(jobs_to_process)} files ---")
start_time = time.time()
converted_count = 0
error_count = 0

# With no max_workers argument the executor chooses its default worker count.
with ProcessPoolExecutor() as executor:
    # Map each future back to its source path so a failure can be attributed
    # to a file even when the worker never returned a result.
    futures = {
        executor.submit(process_file, src, tgt, ext): src
        for (src, tgt, ext) in jobs_to_process
    }

    for future in tqdm(as_completed(futures), total=len(jobs_to_process)):
        try:
            source_path, result = future.result()
        except Exception as e:
            # process_file reports its own errors, so reaching this branch
            # means the worker itself failed (for example the process died).
            # Record it and keep going instead of aborting before the log
            # of failures is written.
            source_path = futures[future]
            result = f"ERROR: worker failed: {e!r}"

        if result == "SUCCESS":
            converted_count += 1
        else:
            # It failed, log it
            error_count += 1
            unsupported_file_paths.append(f"ERROR_PROCESSING: {source_path} | {result}")

print(f"\nParallel processing complete in {time.time() - start_time:.2f} seconds.")

# ------------------------------------------------------------------
# Step 3: Final Logging and Summary
# ------------------------------------------------------------------
print("\nWriting unsupported file log...")
try:
    # Assemble the whole report first: a header, a rule, then one entry per line.
    report_lines = [
        f"Total unsupported or errored files: {len(unsupported_file_paths)}\n",
        "----------------------------------------\n",
    ]
    report_lines.extend(f"{entry}\n" for entry in unsupported_file_paths)

    with open(UNSUPPORTED_LOG_FILE, 'w', encoding='utf-8') as log_file:
        log_file.writelines(report_lines)
    print(f"Saved log of {len(unsupported_file_paths)} unsupported/errored files to: {UNSUPPORTED_LOG_FILE}")
except Exception as log_error:
    print(f"  ERROR: Could not write unsupported file log. {log_error}")


# Final summary of the run.
for summary_line in (
    "--- File-to-Text Conversion Complete ---",
    f"Successfully converted: {converted_count} new files.",
    f"Files that failed processing: {error_count}",
    f"Skipped (already exist): {skipped_exist}",
    f"Skipped (unsupported): {unsupported_skipped}",
    f"\nYour new .txt files are ready in: {TEXT_OUTPUT_DIR}",
):
    print(summary_line)

## SCRIPT B: (NEW) FUZZY HASH DEDUPLICATOR

Run this script AFTER the file-to-text conversion (Script A)
and BEFORE the database builder (Script C).

It will:
1. Install 'ssdeep' (a fuzzy-hashing library).
2. Scan all .txt files in your 'Text_Files' directory.
3. Generate a "fuzzy hash" for every file.
4. Compare every file to every other file (with a progress bar).
5. If two files are > 98% similar, it moves one to a 'Duplicates_Removed' folder.

In [2]:
# @title SCRIPT B: (NEW) FUZZY HASH DEDUPLICATOR
#
# Run this script AFTER the file-to-text conversion (Script A)
# and BEFORE the database builder (Script C).
#
# It will:
# 1. Install 'ssdeep' (a fuzzy-hashing library).
# 2. Scan all .txt files in your 'Text_Files' directory.
# 3. Generate a "fuzzy hash" for every file.
# 4. Compare every file to every other file (with a progress bar).
# 5. If two files are > 98% similar, it moves one to a 'Duplicates_Removed' folder.

# Step 0: install the fuzzy-hashing dependencies used by this cell.
# Lines starting with "!" are IPython/Colab shell escapes, so this cell must
# be run inside a notebook, not as a plain Python script.
print("--- Step 0: Installing Fuzzy Hash Library (ssdeep) ---")
# ssdeep needs the C-library libfuzzy-dev
!apt-get install -y libfuzzy-dev
!pip install ssdeep tqdm -q
# NOTE: this message is printed unconditionally; it does not check whether the
# two install commands above actually succeeded.
print("Libraries (ssdeep, tqdm) installed.")

import os
import shutil
import ssdeep
import time
from google.colab import drive
from tqdm.auto import tqdm

# ------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------
print("\n--- Configuring Paths ---")

# Mount Google Drive first: every path configured below lives on it.
try:
    drive.mount('/content/drive', force_remount=True)
except Exception as mount_error:
    print(f"Error mounting drive: {mount_error}")
    raise
else:
    print("Google Drive mounted.")

# --- !!! VERIFY THESE PATHS !!! ---
# Must be the same output root that Script A wrote to.
MAIN_OUTPUT_DIRECTORY = '/content/drive/MyDrive/Main_Shit_Output'
# --------------------------------

# Folder of .txt files to scan, and the folder duplicates are moved into.
TEXT_SOURCE_DIR = os.path.join(MAIN_OUTPUT_DIRECTORY, 'Text_Files')
DUPLICATE_DIR = os.path.join(MAIN_OUTPUT_DIRECTORY, 'Duplicates_Removed')

# Where the deduplication report is written.
STATE_DIRECTORY = os.path.join(MAIN_OUTPUT_DIRECTORY, 'Script_State')
DEDUPE_LOG_FILE = os.path.join(STATE_DIRECTORY, 'deduplication_log.txt')

# A pair counts as duplicates only when its ssdeep score is strictly above this.
SIMILARITY_THRESHOLD = 98

for required_dir in (DUPLICATE_DIR, STATE_DIRECTORY):
    os.makedirs(required_dir, exist_ok=True)

print(f"Scanning: {TEXT_SOURCE_DIR}")
print(f"Moving duplicates to: {DUPLICATE_DIR}")
print(f"Saving log to: {DEDUPE_LOG_FILE}")

# ------------------------------------------------------------------
# Step 1: Scan and Hash All Files
# ------------------------------------------------------------------
print("\n--- Phase 1: Scanning and Hashing Files ---")
start_time = time.time()

# Collect every .txt file below the text folder.
files_to_scan = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(TEXT_SOURCE_DIR)
    for name in names
    if name.endswith(".txt")
]

print(f"Found {len(files_to_scan)} text files to hash.")

# Maps file path -> fuzzy hash; files whose hash comes back empty are left out.
file_hashes = {}
for txt_path in tqdm(files_to_scan, desc="Hashing files"):
    try:
        digest = ssdeep.hash_from_file(txt_path)
    except Exception as hash_error:
        print(f"  ERROR: Could not hash {txt_path}. {hash_error}")
        continue
    if digest:
        file_hashes[txt_path] = digest

print(f"Hashing complete in {time.time() - start_time:.2f} seconds.")

# ------------------------------------------------------------------
# Step 2: Compare Hashes and Find Duplicates
# ------------------------------------------------------------------
print("\n--- Phase 2: Comparing Hashes (this may take time) ---")
start_time = time.time()

# Convert to a list for indexed comparison
file_list = list(file_hashes.items())
files_to_move = set()
log_entries = []
compare_error_count = 0

# O(n^2) pairwise comparison: every file is compared with each later file.
for i, (path1, hash1) in enumerate(tqdm(file_list, desc="Comparing files")):
    # A file already marked as a duplicate is not used as a reference.
    if path1 in files_to_move:
        continue

    for j in range(i + 1, len(file_list)):
        path2, hash2 = file_list[j]

        # Don't compare a file that's already a known dupe
        if path2 in files_to_move:
            continue

        try:
            similarity = ssdeep.compare(hash1, hash2)
        except Exception as e:
            # A failed comparison is recorded in the report instead of being
            # silently dropped, so an unchecked pair is visible afterwards.
            compare_error_count += 1
            log_entries.append(
                f"ERROR_COMPARE: Could not compare '{path1}' and '{path2}'. {e}"
            )
            continue

        if similarity > SIMILARITY_THRESHOLD:
            # Keep the earlier file (path1) and mark the later one (path2).
            files_to_move.add(path2)
            log_entries.append(
                f"Match ({similarity}%): Kept '{path1}' | Moved '{path2}'"
            )

print(f"Comparison complete in {time.time() - start_time:.2f} seconds.")
print(f"Found {len(files_to_move)} duplicate files to remove.")
if compare_error_count:
    print(f"WARNING: {compare_error_count} comparisons failed; see the deduplication log.")

# ------------------------------------------------------------------
# Step 3: Move Duplicates and Write Log
# ------------------------------------------------------------------
print("\n--- Phase 3: Moving Duplicates and Writing Log ---")

# Move every file marked as a duplicate, counting only the moves that worked
# so the report does not claim files were removed when the move failed.
moved_count = 0
for file_path in files_to_move:
    try:
        # Create the same folder structure in the duplicate dir
        relative_path = os.path.relpath(file_path, TEXT_SOURCE_DIR)
        target_path = os.path.join(DUPLICATE_DIR, relative_path)
        os.makedirs(os.path.dirname(target_path), exist_ok=True)

        shutil.move(file_path, target_path)
        moved_count += 1
    except Exception as e:
        log_entries.append(f"ERROR_MOVE: Could not move '{file_path}'. {e}")

failed_move_count = len(files_to_move) - moved_count

# Write the final log file
try:
    with open(DEDUPE_LOG_FILE, 'w', encoding='utf-8') as f:
        f.write("Deduplication Report\n")
        f.write(f"Threshold: {SIMILARITY_THRESHOLD}%\n")
        f.write(f"Total duplicates removed: {moved_count}\n")
        if failed_move_count:
            f.write(f"Duplicates that could not be moved: {failed_move_count}\n")
        f.write("----------------------------------------\n")
        for entry in log_entries:
            f.write(f"{entry}\n")
    print(f"Successfully wrote log to: {DEDUPE_LOG_FILE}")
except Exception as e:
    print(f"  ERROR: Could not write log file. {e}")

print("\n--- Deduplication Complete ---")
if failed_move_count:
    # Do not claim the folder is clean while known duplicates remain in it.
    print(f"WARNING: {failed_move_count} duplicate files could not be moved and are still in 'Text_Files'.")
else:
    print("Your 'Text_Files' folder is now clean.")
print("You can now run Script C (the database builder).")

## SCRIPT C: TEXT-to-DATABASE (for "Main Shit")

Run this script THIRD, after Script A (file-to-text conversion) and Script B (deduplication).

It will:
1. Install dependencies (`chromadb`, `sentence-transformers`).
2. Load the text files from the output of Script B.
3. Generate embeddings for each text file.
4. Build a ChromaDB vector database.
5. Move the final database to Google Drive.

In [None]:
# @title SCRIPT C: TEXT-to-DATABASE (for "Main Shit")
#
# Run this script THIRD, after Script A (file-to-text conversion) and Script B (deduplication).

import os, re, shutil, time, pickle, sys
from google.colab import drive

# ------------------------------------------------------------------
# SETUP: Mount Drive
# ------------------------------------------------------------------
print("--- Running Setup: Mount Drive ---")
try:
    drive.mount('/content/drive', force_remount=True)
except Exception as mount_error:
    print(f"Error mounting drive: {mount_error}")
    raise
else:
    print("Google Drive mounted successfully.")

# ------------------------------------------------------------------
# CONFIGURATION: Define All Directories
# ------------------------------------------------------------------
print("\n--- Running Configuration: Setting Paths ---")

# --- !!! THIS MUST MATCH THE OUTPUT FOLDER FROM SCRIPT A !!! ---
MAIN_OUTPUT_DIRECTORY = '/content/drive/MyDrive/Main_Shit_Output'
# ---------------------------------------------------------------

# Input: the folder of .txt files under the main output folder.
SOURCE_DIRECTORY = os.path.join(MAIN_OUTPUT_DIRECTORY, 'Text_Files')

# Script state (logs) also lives inside the main output folder.
STATE_DIRECTORY = os.path.join(MAIN_OUTPUT_DIRECTORY, 'Script_State')

# The database is built on the local disk and later placed on Google Drive.
LOCAL_DB_PATH = '/content/Main_Shit_Vector_Database'
FINAL_DB_PATH_ON_DRIVE = os.path.join(MAIN_OUTPUT_DIRECTORY, 'Vector_Database')
COLLECTION_NAME = "main_shit_archive"

# Log of files that have already been handled by the database build.
PHASE4_LOG = os.path.join(STATE_DIRECTORY, 'db_processed_files.log')

for path_label, path_value in (
    ("Source (Input)", SOURCE_DIRECTORY),
    ("State Files", STATE_DIRECTORY),
    ("Local DB Build Path", LOCAL_DB_PATH),
    ("Final DB GDrive Path", FINAL_DB_PATH_ON_DRIVE),
):
    print(f"{path_label}: {path_value}")

for required_dir in (STATE_DIRECTORY, FINAL_DB_PATH_ON_DRIVE):
    os.makedirs(required_dir, exist_ok=True)

# ------------------------------------------------------------------
# Phase 1: Install DB Dependencies
# ------------------------------------------------------------------
print("\n--- Phase 1: Installing Database Dependencies ---")
try:
    import chromadb
    import sentence_transformers
    print("ChromaDB and SentenceTransformers are already installed.")
except ImportError:
    import importlib
    import subprocess

    print("Installing chromadb and sentence-transformers...")
    # Run pip through the interpreter that is executing this cell so the
    # packages land in the environment we import from. check_call raises
    # CalledProcessError if the install fails, instead of carrying on to a
    # confusing ImportError below.
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', 'chromadb', 'sentence-transformers', '-q']
    )
    # Make the import system notice the freshly installed packages.
    importlib.invalidate_caches()
    print("Installation complete.")

import chromadb
from sentence_transformers import SentenceTransformer

# ------------------------------------------------------------------
# Phase 2: Create Vector Database (Resumable)
# ------------------------------------------------------------------
print("\n--- Phase 2: Building Vector Database ---")

# Restore the set of files handled by earlier runs (one path per log line).
# NOTE(review): this log is stored under the Drive output folder while the
# database is built at LOCAL_DB_PATH on the local disk; if the local database
# is missing, the log still marks those files as done -- confirm this is the
# intended resume behaviour.
print("Loading processing state for Database...")
processed_files_p4 = set()
try:
    if os.path.exists(PHASE4_LOG):
        with open(PHASE4_LOG, 'r', encoding='utf-8') as state_file:
            logged_paths = state_file.read().splitlines()
        processed_files_p4 = set(logged_paths)
        print(f"Loaded {len(processed_files_p4)} processed file records for DB.")
except Exception as state_error:
    print(f"Could not load DB state, starting fresh. Error: {state_error}")

# Embedding model used for the documents and for queries.
print("Loading SentenceTransformer model (all-MiniLM-L6-v2)...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded.")

# Open (or create) the on-disk database and its collection.
client = chromadb.PersistentClient(path=LOCAL_DB_PATH)
collection = client.get_or_create_collection(name=COLLECTION_NAME)
print(f"Database client running. Collection '{COLLECTION_NAME}' has {collection.count()} documents.")

print(f"Starting embedding process for {SOURCE_DIRECTORY}...")
batch_documents = []
batch_metadatas = []
batch_ids = []
BATCH_SIZE = 50
total_scanned_p4 = 0
new_added_p4 = 0
start_time_p4 = time.time()
last_heartbeat_p4 = time.time()


def _flush_batch(log_handle, documents, metadatas, ids):
    """Embed one batch, add it to the collection and log its ids.

    Returns the number of documents that were added.
    """
    embeddings = model.encode(documents).tolist()
    collection.add(embeddings=embeddings, documents=documents, metadatas=metadatas, ids=ids)
    log_handle.writelines(f"{doc_id}\n" for doc_id in ids)
    # Flush so the resume log reflects this batch even if a later step fails.
    log_handle.flush()
    processed_files_p4.update(ids)
    return len(ids)


try:
    with open(PHASE4_LOG, 'a', encoding='utf-8') as f_log:
        for dirpath, dirnames, filenames in os.walk(SOURCE_DIRECTORY):
            for filename in filenames:
                if not filename.endswith(".txt"):
                    continue
                total_scanned_p4 += 1
                file_path = os.path.join(dirpath, filename)
                if file_path in processed_files_p4:
                    continue

                # Print a progress line at most once every 30 seconds.
                current_time = time.time()
                if current_time - last_heartbeat_p4 > 30:
                    print(f"\r  ... Phase 2 Heartbeat: Scanned {total_scanned_p4} files. Added {new_added_p4} new documents. Processing: {filename} \033[K", end='')
                    last_heartbeat_p4 = current_time

                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                except Exception as e:
                    print(f"\nERROR: Could not read {file_path}. {e}")
                    continue

                # Empty files are logged as handled but never added to the DB.
                if not content.strip():
                    f_log.write(file_path + '\n')
                    processed_files_p4.add(file_path)
                    continue

                batch_documents.append(content)
                relative_path = os.path.relpath(file_path, SOURCE_DIRECTORY)
                batch_metadatas.append({"source": relative_path})
                # The absolute file path doubles as the document id.
                batch_ids.append(file_path)

                if len(batch_ids) >= BATCH_SIZE:
                    added = _flush_batch(f_log, batch_documents, batch_metadatas, batch_ids)
                    new_added_p4 += added
                    print(f"\n... Added batch of {added}. Total new documents: {new_added_p4} ...")
                    batch_documents, batch_metadatas, batch_ids = [], [], []

        # The last, partially filled batch must be flushed while the log file
        # is still open. Doing this after the `with` block wrote to a closed
        # file, so the final batch was added to the DB but never logged or
        # counted.
        if batch_ids:
            print(f"\n... Adding final batch of {len(batch_ids)} files ...")
            new_added_p4 += _flush_batch(f_log, batch_documents, batch_metadatas, batch_ids)
            batch_documents, batch_metadatas, batch_ids = [], [], []

except Exception as e:
    print(f"\n--- An error occurred during embedding: {e} ---")
finally:
    print()
    print("\n--- Phase 2 Embedding finished ---")
    print(f"Time taken: {time.time() - start_time_p4:.2f} seconds.")
    print(f"Total new files added to DB: {new_added_p4}")
    # NOTE: this is every scanned file that was not added in this run, so it
    # also includes empty and unreadable files, not only files already in the DB.
    skipped_p4 = total_scanned_p4 - new_added_p4
    print(f"Total files skipped (already in DB): {skipped_p4}")
    print(f"\nDatabase now contains {collection.count()} total documents.")
    print(f"Database is built locally at: {LOCAL_DB_PATH}")

# ------------------------------------------------------------------
# Phase 3: Test Query (On Local DB)
# ------------------------------------------------------------------
print("\n\n--- Phase 3: Running a Test Query ---")

if collection.count() <= 0:
    print("Database is empty. No query performed.")
else:
    query_text = "how to build a custom gpt"
    print(f"Query: '{query_text}'")

    # Embed the query with the same model and ask for the three nearest documents.
    query_embedding = model.encode([query_text]).tolist()
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=3,
        include=['documents', 'metadatas', 'distances'],
    )

    print("--- Top 3 Results ---")
    # Results are nested per query; index 0 belongs to our single query.
    hit_ids = results['ids'][0] if 'ids' in results else []
    if not hit_ids:
        print("No results found.")
    else:
        for pos in range(len(hit_ids)):
            hit_distance = results['distances'][0][pos]
            hit_metadata = results['metadatas'][0][pos]
            # Show only the first 350 characters of the stored text.
            preview = results['documents'][0][pos][:350] + "..."

            print(f"\nResult {pos+1} (Distance: {hit_distance:.4f})")
            print(f"  Source: {hit_metadata.get('source', 'N/A')}")
            print(f"  Content: {preview}")

# ------------------------------------------------------------------
# Phase 4: Copy Finished DB to Google Drive
# ------------------------------------------------------------------
print("\n\n--- Phase 4: Moving Database to Google Drive ---")
print(f"Moving local DB from: {LOCAL_DB_PATH}")
print(f"                to: {FINAL_DB_PATH_ON_DRIVE}")

try:
    # Only attempt the move when the local database directory exists.
    if os.path.exists(LOCAL_DB_PATH):
        if os.path.isdir(FINAL_DB_PATH_ON_DRIVE):
            if os.listdir(FINAL_DB_PATH_ON_DRIVE):
                # The Drive folder already holds data (for example a database
                # saved by an earlier run). Keep it under a timestamped name
                # rather than deleting it as if it were an empty placeholder.
                timestamp = time.strftime('%Y%m%d_%H%M%S')
                backup_path = f"{FINAL_DB_PATH_ON_DRIVE}_backup_{timestamp}"
                print(f"Existing database found on Google Drive; keeping it as: {backup_path}")
                os.rename(FINAL_DB_PATH_ON_DRIVE, backup_path)
            else:
                print("Removing empty placeholder folder from Google Drive...")
                os.rmdir(FINAL_DB_PATH_ON_DRIVE)

        # shutil.move raises on failure, so the success message below is only
        # printed when the move really happened. It also copes with spaces in
        # the paths, unlike an unquoted shell command.
        shutil.move(LOCAL_DB_PATH, FINAL_DB_PATH_ON_DRIVE)

        print("\n--- Move complete! ---")
        print(f"Your persistent database is now saved at: {FINAL_DB_PATH_ON_DRIVE}")
    else:
        print("\n--- Skipping move: Local database not found ---")
        print(f"Local database was not created successfully at: {LOCAL_DB_PATH}")
except Exception as e:
    print("\n--- ERROR: Could not move database to Google Drive ---")
    print(f"Error: {e}")
    print(f"Your database is still safe on the local Colab disk at: {LOCAL_DB_PATH}")
    print("You can manually copy it using the file explorer.")

print("\n--- TEXT-TO-DATABASE SCRIPT FINISHED ---")