In [1]:
# @title #1 install required packages
!pip install openai chromadb faiss-cpu tqdm google-auth google-colab
!pip install -U langchain-community

Collecting chromadb
  Downloading chromadb-1.3.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.38.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

In [2]:
# @title #2  Load OpenAI API Key from Colab Secrets
from google.colab import userdata
import os

# Replace 'OPENAI_API_KEY' with the name of your stored secret
api_key = userdata.get('OPENAI_API_KEY')
os.environ["OPENAI_API_KEY"] = api_key
print("✅ OpenAI API key loaded from Colab secrets.")


TimeoutException: Requesting secret OPENAI_API_KEY timed out. Secrets can only be fetched when running from the Colab UI.

In [None]:
# @title  #3 Markdown Embedding App for Google Colab
# ---------------------------------------
# Purpose: Convert Markdown files into a vector database stored on Google Drive.
# This script is designed for Colab Free Tier: efficient, fault-tolerant, resumable, and fully logged.

# %% [markdown]
# ## 1. Setup and Dependencies
# Install necessary packages and mount Google Drive.



from google.colab import drive
import os, json, time, uuid, datetime
from tqdm import tqdm
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

drive.mount('/content/drive')

# %% [markdown]
# ## 2. Configuration Input
# Ask the user for input and setup paths.

input_dir = input("Enter the full path to your folder of Markdown files: ").strip()
output_root = "/content/drive/MyDrive/vector_dbs/"
os.makedirs(output_root, exist_ok=True)

version_id = str(uuid.uuid4())
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = os.path.join(output_root, f"session_{timestamp}")
os.makedirs(output_dir, exist_ok=True)

heartbeat_interval = 30  # seconds

# %% [markdown]
# ## 3. File Discovery
# Scan for `.md` files and prepare processing queue.

files_to_process = []
for root, _, files in os.walk(input_dir):
    for f in files:
        if f.endswith('.md'):
            files_to_process.append(os.path.join(root, f))

print(f"Found {len(files_to_process)} Markdown files to process.")

# %% [markdown]
# ## 4. Heartbeat Monitor
# Prints periodic status updates while processing.

import threading

def heartbeat():
    while True:
        print(f"[Heartbeat] App is active... {datetime.datetime.now().strftime('%H:%M:%S')}")
        time.sleep(heartbeat_interval)

threading.Thread(target=heartbeat, daemon=True).start()

# %% [markdown]
# ## 5. Embedding Engine Setup
# Initialize the embedding model and Chroma vector database.

embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
vector_db = Chroma(collection_name="markdown_knowledge", embedding_function=embedding_model, persist_directory=output_dir)

# %% [markdown]
# ## 6. Processing Loop
# Read, embed, and store each Markdown file.

success_count = 0
fail_count = 0
report_log = []

for file_path in tqdm(files_to_process, desc="Embedding Markdown Files"):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        if not content.strip():
            raise ValueError("File empty.")

        metadata = {
            "file": os.path.basename(file_path),
            "path": file_path,
            "timestamp": timestamp,
        }

        vector_db.add_texts([content], metadatas=[metadata])
        success_count += 1
        report_log.append({"file": file_path, "status": "Success"})

    except Exception as e:
        fail_count += 1
        report_log.append({"file": file_path, "status": "Failed", "error": str(e)})
        continue

# %% [markdown]
# ## 7. Save and Persist Database
# Commit the database to disk and ensure all data is flushed.

vector_db.persist()

with open(os.path.join(output_dir, "report.json"), 'w') as f:
    json.dump(report_log, f, indent=2)

print(f"\nEmbedding completed: {success_count} success, {fail_count} failed.")

# %% [markdown]
# ## 8. Version Tracking
# Create or update version history file with details for GPT reference.

version_entry = {
    "version_id": version_id,
    "created": timestamp,
    "files_embedded": success_count,
    "failures": fail_count,
    "source_dir": input_dir,
    "db_path": output_dir
}

version_log_path = os.path.join(output_root, "version_history.json")

if os.path.exists(version_log_path):
    with open(version_log_path, 'r') as f:
        history = json.load(f)
else:
    history = []

history.append(version_entry)
with open(version_log_path, 'w') as f:
    json.dump(history, f, indent=2)

print(f"Version history updated. ID: {version_id}")

# %% [markdown]
# ## 9. Final Report
# Generate HTML summary like _report.html for review.

html_report = f"""
<html><head><title>Embedding Report</title></head><body>
<h1>Markdown Embedding Report</h1>
<p><strong>Session:</strong> {timestamp}</p>
<p><strong>Version ID:</strong> {version_id}</p>
<p><strong>Success:</strong> {success_count}</p>
<p><strong>Failures:</strong> {fail_count}</p>
<p><strong>Database Path:</strong> {output_dir}</p>
</body></html>
"""

with open(os.path.join(output_dir, "embedding_report.html"), 'w') as f:
    f.write(html_report)

print(f"Report saved to {output_dir}/embedding_report.html")


In [None]:
# @title 4. (Optional) Load Existing Database Session
#
# Use this cell INSTEAD of running the main embedding process (cell #5)
# if you only want to load an old database (e.g., to export it to JSON).
#
# You must run the setup cells (1-4) first to install packages and load your API key.

import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# --- 1. Initialize Embedding Model ---
# We must initialize the *same* embedding model that was used to create the DB.
try:
    if 'embedding_model' not in locals():
         print("Initializing embedding model (text-embedding-3-large)...")
         embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
    else:
         print("Embedding model already loaded.")
except Exception as e:
    print(f"❌ Error initializing embedding model: {e}")
    print("Please ensure your OPENAI_API_KEY is set in Cell 4.")

# --- 2. Get Path from User ---
print("\n--- Load Existing Database ---")
print("Provide the full path to the *existing* session folder you want to load.")
print("Example: /content/drive/MyDrive/vector_dbs/session_2025-10-31_12-00-00")
existing_db_path = input("Enter path to session folder: ").strip()

# --- 3. Load the Database ---
if 'embedding_model' in locals() and os.path.isdir(existing_db_path):
    try:
        print(f"\nLoading database from: {existing_db_path}")

        # This points vector_db to your *existing* persisted database
        vector_db = Chroma(
            collection_name="markdown_knowledge",
            embedding_function=embedding_model,
            persist_directory=existing_db_path
        )

        # This global variable is used by the JSON export cell (Cell 10)
        # to know where to save the new JSON files.
        output_dir = existing_db_path

        print(f"✅ Successfully loaded {vector_db._collection.count()} items.")
        print(f"The 'vector_db' variable is now set to this database.")
        print("You can now run the JSON export cell (Cell 10).")

    except Exception as e:
        print(f"\n❌ Error loading database: {e}")
        print("Please check the path and ensure the folder contains a valid Chroma database.")
elif not os.path.isdir(existing_db_path):
    print(f"\n❌ Error: Directory not found at path: {existing_db_path}")
else:
    print(f"\n❌ Error: Embedding model not initialized. Run cell 4.")

In [None]:
# @title 5. (Optional) Export Database to JSON Chunks
# This cell retrieves all embedded data from the Chroma database
# and exports it into a series of JSON files, each limited to ~20MB.
#
# Each JSON file will contain an "export_info" block with the date
# and a unique version timestamp for this specific conversion.

import json
import os
import sys
import datetime
from tqdm import tqdm

# --- Configuration ---
MAX_CHUNK_SIZE_MB = 15.5  # Set to slightly less than 20MB for safety
MAX_BYTES = MAX_CHUNK_SIZE_MB * 1024 * 1024
json_export_dir = os.path.join(output_dir, "json_export")
os.makedirs(json_export_dir, exist_ok=True)
# ---------------------

print(f"Retrieving all data from vector database...")
try:
    # Get all documents, metadatas, and their IDs
    results = vector_db.get(include=["metadatas", "documents"])

    all_data = []
    for i in range(len(results['ids'])):
        all_data.append({
            "id": results['ids'][i],
            "document": results['documents'][i],
            "metadata": results['metadatas'][i]
        })

    # --- NEW: Generate metadata for this export job ---
    total_items_in_db = len(all_data)

    # This timestamp will be the unique "version #" for this conversion
    export_version = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d_%H%M%S_UTC")
    export_date_utc = datetime.datetime.now(datetime.timezone.utc).isoformat()
    # ----------------------------------------------------

    print(f"Retrieved {total_items_in_db} items. Now chunking...")
    print(f"This export's unique version: {export_version}")

    current_chunk_data = [] # This list will hold the items for the current chunk
    chunk_index = 1

    pbar = tqdm(all_data, desc="Exporting to JSON chunks")
    for item in pbar:
        # Add item and check new total size
        current_chunk_data.append(item)

        # --- MODIFIED: Create the full JSON object for size checking ---
        # This is the new structure that will be written to the file
        json_to_write = {
            "export_info": {
                "export_version": export_version,
                "export_date_utc": export_date_utc,
                "total_items_in_db": total_items_in_db,
                "chunk_index": chunk_index,
                "items_in_this_chunk": len(current_chunk_data)
            },
            "chunk_data": current_chunk_data # The list of items
        }

        current_json_string = json.dumps(json_to_write)
        current_size_bytes = len(current_json_string.encode('utf-8'))

        # If chunk exceeds max size, write the *previous* state
        if current_size_bytes > MAX_BYTES and len(current_chunk_data) > 1:
            # Pop the last item that caused the overflow
            item_to_move = current_chunk_data.pop()

            # --- MODIFIED: Create the final object *without* the overflow item ---
            final_chunk_data_list = list(current_chunk_data) # The list *before* the last item
            json_to_write = {
                "export_info": {
                    "export_version": export_version,
                    "export_date_utc": export_date_utc,
                    "total_items_in_db": total_items_in_db,
                    "chunk_index": chunk_index,
                    "items_in_this_chunk": len(final_chunk_data_list)
                },
                "chunk_data": final_chunk_data_list
            }

            # Write the chunk
            json_export_path = os.path.join(json_export_dir, f"data_chunk_{chunk_index:03d}.json")
            with open(json_export_path, 'w', encoding='utf-8') as f:
                json.dump(json_to_write, f, indent=2)

            pbar.set_description(f"Wrote chunk {chunk_index}")

            # Start the new chunk with the item that didn't fit
            current_chunk_data = [item_to_move]
            chunk_index += 1

        elif current_size_bytes > MAX_BYTES and len(current_chunk_data) == 1:
            # This single item is larger than the max chunk size
            # The 'json_to_write' object is already correct (with 1 item)
            json_export_path = os.path.join(json_export_dir, f"data_chunk_{chunk_index:03d}.json")
            with open(json_export_path, 'w', encoding='utf-8') as f:
                json.dump(json_to_write, f, indent=2)

            pbar.set_description(f"Wrote large chunk {chunk_index}")

            # Reset for next loop
            current_chunk_data = []
            chunk_index += 1


    # --- MODIFIED: Write any remaining data in the last chunk ---
    if current_chunk_data:
        json_to_write = {
            "export_info": {
                "export_version": export_version,
                "export_date_utc": export_date_utc,
                "total_items_in_db": total_items_in_db,
                "chunk_index": chunk_index,
                "items_in_this_chunk": len(current_chunk_data)
            },
            "chunk_data": current_chunk_data
        }
        json_export_path = os.path.join(json_export_dir, f"data_chunk_{chunk_index:03d}.json")
        with open(json_export_path, 'w', encoding='utf-8') as f:
            json.dump(json_to_write, f, indent=2)
        pbar.set_description(f"Wrote final chunk {chunk_index}")

    print(f"\n✅ JSON export complete. {chunk_index} file(s) saved in: {json_export_dir}")

except Exception as e:
    print(f"\n❌ An error occurred during JSON export: {e}")
    print("Ensure the variable 'vector_db' (from cell 5) or 'output_dir' (from cell 11) exists.")

In [4]:
from google.colab import drive
drive.mount('/content/drive')

KeyboardInterrupt: 