In [3]:
import os
from dotenv import load_dotenv
import chromadb
from openai import OpenAI
from chromadb.utils import embedding_functions
import streamlit as st

# Load environment
load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")

# Load documents from directory
def load_documents_from_directory(directory_path):
    documents = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".md"):
            with open(os.path.join(directory_path, filename), "r", encoding="utf-8") as file:
                documents.append({"id": filename, "text": file.read()})
    return documents

def split_text(text, chunk_size=1000, chunk_overlap=20):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - chunk_overlap
    return chunks

# This function now receives the OpenAI client
def get_openai_embedding(text, openai_client):
    response = openai_client.embeddings.create(input=text, model="text-embedding-3-small")
    embedding = response.data[0].embedding
    return embedding

def create_embeddings():
    # Initialize consistent Chroma and OpenAI clients
    chroma_client = chromadb.PersistentClient(path="./data/chroma_persistent_storage")
    openai_client = OpenAI(api_key=openai_key)

    collection_name = "document_qa_collection"

    # Delete old collection (if it exists)
    try:
        chroma_client.delete_collection(name=collection_name)
        print("✅ Deleted previous collection.")
    except Exception as e:
        print(f"⚠️ Could not delete collection: {e}")

    # Initialize embedding function for Chroma
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=openai_key,
        model_name="text-embedding-3-small",
    )

    # Recreate collection with embedding function
    collection = chroma_client.get_or_create_collection(
        name=collection_name,
        embedding_function=openai_ef
    )

    # Load markdown files
    directory_path = "./data/pages"
    if not os.path.exists(directory_path):
        raise FileNotFoundError(f"❌ Directory '{directory_path}' not found. Run layout analysis first.")

    documents = load_documents_from_directory(directory_path)
    print(f"📄 Loaded {len(documents)} pages")

    # Split text into chunks
    chunked_documents = []
    for doc in documents:
        chunks = split_text(doc['text'])
        for i, chunk in enumerate(chunks):
            chunked_documents.append({"id": f"{doc['id']}_chunk{i+1}", "text": chunk})

    print(f"✂️ Split into {len(chunked_documents)} chunks")

    # Generate embeddings and store
    for doc in chunked_documents:
        print("🧠 Generating embeddings...")
        doc["embedding"] = get_openai_embedding(doc["text"], openai_client)

    for doc in chunked_documents:
        print("📥 Inserting chunk into ChromaDB...")
        collection.upsert(
            ids=[doc["id"]],
            documents=[doc["text"]],
            embeddings=[doc["embedding"]]
        )

    st.success("✅ Embeddings generated and stored successfully.")


In [8]:
chroma_client = chromadb.PersistentClient(path="./data/chroma_persistent_storage")
openai_client = OpenAI(api_key=openai_key)

collection_name = "document_qa_collection"
client = chromadb.PersistentClient(path="./data/chroma_persistent_storage")
client.get_or_create_collection(name="init_collection")
try:
    chroma_client.delete_collection(name=collection_name)
    print("✅ Deleted previous collection.")
except Exception as e:
    
    print(f"⚠️ Could not delete collection: {e}")
    
# Initialize embedding function for Chroma
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_key,
    model_name="text-embedding-3-small",
)

# Recreate collection with embedding function
collection = chroma_client.get_or_create_collection(
    name=collection_name,
    embedding_function=openai_ef
)

⚠️ Could not delete collection: Collection document_qa_collection does not exist.


OperationalError: attempt to write a readonly database

In [4]:
create_embeddings()

✅ Deleted previous collection.
📄 Loaded 11 pages
✂️ Split into 71 chunks
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generating embeddings...
🧠 Generat



📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk into ChromaDB...
📥 Inserting chunk in