In [4]:
import os
import chromadb
from sentence_transformers import SentenceTransformer

# Configuration — the values a reader is expected to tune.
# The docs path is built with os.path.join so the sibling "demo_bot_data"
# folder resolves on Windows, Linux and macOS alike; a literal backslash is a
# separator only on Windows and becomes part of the folder name elsewhere,
# which makes os.walk silently find nothing.
DOCS_FOLDER = os.path.join(os.pardir, "demo_bot_data")  # Change this to your folder path
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # model id passed to SentenceTransformer
DB_PATH = "chromadb_store"  # path handed to chromadb.PersistentClient

# Load the embedding model named by EMBEDDING_MODEL. `model` is a notebook-wide
# global: the indexing loop and query_docs both call model.encode(), so
# documents and queries are embedded by the same encoder.
model = SentenceTransformer(EMBEDDING_MODEL)

# Open the ChromaDB client rooted at DB_PATH and obtain the "ubuntu_docs"
# collection via get_or_create_collection, i.e. reuse it if it already exists.
# NOTE(review): with a persistent client, records from earlier runs presumably
# survive kernel restarts — confirm before assuming the collection starts empty.
client = chromadb.PersistentClient(path=DB_PATH)
collection = client.get_or_create_collection(name="ubuntu_docs")

# Function to read Markdown files
def load_markdown_files(folder_path):
    """Recursively collect the Markdown files found under ``folder_path``.

    Directories and file names are visited in sorted order so the returned
    list -- and therefore any position-derived ids built from it -- is the
    same on every run and platform (``os.walk`` itself yields entries in
    arbitrary, filesystem-dependent order). The ``.md`` suffix is matched
    case-insensitively so files such as ``README.MD`` are not skipped.

    Args:
        folder_path: Root directory to scan.

    Returns:
        A list of ``(filename, text)`` tuples, where ``filename`` is the bare
        file name (no directory part) and ``text`` is the file content decoded
        as UTF-8. The list is empty when ``folder_path`` does not exist or
        contains no Markdown files.

    Raises:
        UnicodeDecodeError: If a matching file is not valid UTF-8.
    """
    docs = []
    for root, dirs, files in os.walk(folder_path):
        # Sorting in place controls the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            if not file.lower().endswith(".md"):
                continue
            file_path = os.path.join(root, file)
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()
            # Append after the file is closed; only the text is needed.
            docs.append((file, text))
    return docs

# Load documents
documents = load_markdown_files(DOCS_FOLDER)

# Process and store in ChromaDB
if not documents:
    # An empty result almost always means DOCS_FOLDER points at the wrong
    # place; say so instead of reporting success for an index that got nothing.
    print(f"⚠️ No Markdown files found under {DOCS_FOLDER!r}; nothing was stored.")
else:
    for idx, (filename, text) in enumerate(documents):
        # Each file is embedded as a single vector.
        # NOTE(review): the encoder presumably truncates inputs beyond its
        # maximum sequence length, so long files may be represented only by
        # their opening section — consider chunking; confirm on the model card.
        embedding = model.encode(text).tolist()

        # upsert (not add) keeps this cell re-runnable: ids are the list
        # positions "0", "1", ..., so a second run reuses the same ids, and
        # add() on an existing id errors or is ignored depending on the Chroma
        # version, leaving stale content behind.
        # The full text is stored under the 'content' metadata key because
        # query_docs reads it back from there.
        collection.upsert(
            ids=[str(idx)],
            embeddings=[embedding],
            metadatas=[{"filename": filename, "content": text}],
        )

    print("✅ Ubuntu documentation successfully stored in ChromaDB!")

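# Example query helpers — earlier drafts, kept commented out for reference.
# The active query_docs (which returns the stored content) is defined in the next cell.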
# def query_docs(query_text, top_k=5):
#     query_embedding = model.encode(query_text).tolist()
#     results = collection.query(query_embeddings=[query_embedding], n_results=top_k)
#     return results["metadatas"]

# def query_docs(query_text, top_k=5):
#     query_embedding = model.encode(query_text).tolist()
#     results = collection.query(query_embeddings=[query_embedding], n_results=top_k)

#     # Extract metadata properly
#     metadatas = results.get("metadatas", [[]])[0]  # Extract first list from the nested structure
#     return [metadata.get("filename", "Unknown") for metadata in metadatas] 

✅ Ubuntu documentation successfully stored in ChromaDB!


In [5]:
def query_docs(query_text, top_k=5):
    """Return the stored content of the documents nearest to ``query_text``.

    The query is embedded with the notebook-level ``model`` and looked up in
    the notebook-level ``collection``.

    Args:
        query_text: Natural-language query to embed.
        top_k: Number of nearest records requested from the collection.

    Returns:
        A list of strings, one per returned record, taken from each record's
        ``"content"`` metadata field. Records with no metadata or no
        ``"content"`` key contribute ``""``. The list is empty when the result
        carries no metadata at all.
    """
    query_embedding = model.encode(query_text).tolist()
    results = collection.query(query_embeddings=[query_embedding], n_results=top_k)

    # Results are nested one list per query embedding; a single query is sent,
    # so only the first inner list matters. `or` covers a missing key, an
    # explicit None and an empty outer list, all of which would otherwise raise.
    per_query = results.get("metadatas") or [[]]
    metadatas = per_query[0] or []

    # An individual record's metadata can itself be None.
    return [(metadata or {}).get("content", "") for metadata in metadatas]
# Smoke-test the retriever with a sample question. query_docs returns a list
# of content strings, so print() shows that list's plain-text repr.
query = "How to install packages in Ubuntu?"

print(query_docs(query))



In [19]:
# Retrieve context for a second question and collapse it into one string.
# NOTE(review): query_docs returns a list, so str() yields its Python repr
# (brackets, quotes, escaped newlines). If content1 is later fed into a prompt,
# joining the items (e.g. "\n\n".join(...)) is probably what is wanted —
# confirm with whatever consumes content1.
query = "How to save the image in ubuntu?"
content1 = str(query_docs(query))

In [49]:
# Clear previous data: the indexing cell used the stringified list positions
# 0 .. len(documents) - 1 as record ids, so rebuild that same id list here.
collection.delete(ids=list(map(str, range(len(documents)))))
