In [1]:
import chromadb
from chromadb.config import Settings
from transformers import AutoTokenizer, AutoModel
import torch

# Step 1: Initialize Hugging Face model and tokenizer for embeddings
def initialize_embedding_model():
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return tokenizer, model

# Step 2: Generate embeddings for query text
def get_embeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embeddings

# Step 3: Query the ChromaDB collection
def query_knowledge_base(query, collection_name="python_tutor", persist_directory="../data/chroma_db2", n_results=5):
    # Initialize ChromaDB PersistentClient
    client = chromadb.PersistentClient(path=persist_directory)
    
    try:
        # Access the collection
        collection = client.get_or_create_collection(name=collection_name)
    except Exception as e:
        print(f"Error accessing collection: {e}")
        return []

    # Initialize embedding model
    tokenizer, model = initialize_embedding_model()
    
    # Generate query embedding
    query_embedding = get_embeddings(query, tokenizer, model)
    
    # Query the collection
    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=n_results
    )
    
    # Format and return results
    formatted_results = []
    for i, (doc, metadata, distance) in enumerate(zip(results['documents'][0], results['metadatas'][0], results['distances'][0])):
        formatted_results.append({
            'rank': i + 1,
            'text': doc,
            'source': metadata.get('source', 'Unknown'),
            'section': metadata.get('section', 'Unknown'),
            'chunk_id': metadata.get('chunk_id', 'Unknown'),
            'similarity_score': 1 - distance  # Convert distance to similarity (assuming cosine distance)
        })
    return formatted_results

In [2]:
# Step 4: Test the query function with a sample question
query = "How do I define a function in Python?"
results = query_knowledge_base(query)

# Display results
print(f"Query: {query}\n")
for result in results:
    print(f"Rank: {result['rank']}")
    print(f"Source: {result['source']}")
    print(f"Section: {result['section']}")
    print(f"Chunk ID: {result['chunk_id']}")
    print(f"Similarity Score: {result['similarity_score']:.4f}")
    print(f"Text: {result['text'][:500]}...")  # Truncate for readability
    print("-" * 80)

Query: How do I define a function in Python?

