In [1]:
# Install required packages
# %pip install -qU llama-index
# %pip install -qU llama-index-llms-mistralai
# %pip install -qU llama-index-embeddings-mistralai
# %pip install -qU llama-index-vector-stores-chroma
# %pip install -qU pypdf beautifulsoup4 requests chromadb gradio nltk numpy

In [2]:
import os
import re
import requests
import nltk
import numpy as np
from bs4 import BeautifulSoup
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings, Document, StorageContext
from llama_index.llms.mistralai import MistralAI
# from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
import gradio as gr
from typing import List, Dict, Any

# Download NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

print("Libraries imported successfully")

Libraries imported successfully


In [3]:
import os
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Set up Mistral API key from environment variables
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")

if not MISTRAL_API_KEY:
    print("MISTRAL_API_KEY not found. Please set it in your .env file.")
else:
    os.environ["MISTRAL_API_KEY"] = MISTRAL_API_KEY
    print("API key configured successfully from .env file")

API key configured successfully from .env file


In [4]:
# Context Handler Class for Basic Context Handling
class ContextHandler:
    def __init__(self, max_history=5):
        self.conversation_history = []
        self.max_history = max_history
    
    def add_to_history(self, question: str, answer: str):
        """Add Q&A pair to conversation history"""
        self.conversation_history.append({
            'question': question,
            'answer': answer
        })
        
        # Keep only recent history
        if len(self.conversation_history) > self.max_history:
            self.conversation_history = self.conversation_history[-self.max_history:]
    
    def get_context_string(self) -> str:
        """Get formatted context string for the LLM"""
        if not self.conversation_history:
            return ""
        
        context = "\nPrevious conversation context:\n"
        for i, entry in enumerate(self.conversation_history, 1):
            context += f"Q{i}: {entry['question']}\n"
            context += f"A{i}: {entry['answer'][:200]}...\n\n"  # Truncate long answers
        
        return context
    
    def clear_history(self):
        """Clear conversation history"""
        self.conversation_history = []

# Initialize context handler
context_handler = ContextHandler()
print("Context handler initialized")

Context handler initialized


Models used:
* Embedding model: sentence-transformers/all-MiniLM-L6-v2
* LLM model: codestral 25.01

In [5]:
# Initialize LLM and Embedding models with enhanced system prompt
system_prompt = """You are an expert Python programming tutor and assistant. You help students learn Python programming concepts, explain code, debug issues, and provide clear, practical examples. 
When answering questions:
- Provide clear, step-by-step explanations
- Include relevant code examples when appropriate
- Explain concepts in simple terms for beginners
- If you don't know something from the provided context, say so clearly
- Focus on practical, hands-on learning
- Consider the conversation history when relevant to provide contextual responses
- Build upon previous questions and answers when appropriate"""

llm = MistralAI(
    model="codestral-latest", 
    api_key=MISTRAL_API_KEY,
    system_prompt=system_prompt
)
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

Settings.llm = llm
Settings.embed_model = embed_model

print("LLM and embedding models initialized with enhanced system prompt")
print(f"Embedding model dimension: {embed_model._model.get_sentence_embedding_dimension()}")

LLM and embedding models initialized with enhanced system prompt
Embedding model dimension: 384


In [6]:
# Web scraping function using BeautifulSoup
def scrape_website(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        
        text = soup.get_text()
        return text
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return ""

# Scrape Python tutorial website
website_url = "https://www.geeksforgeeks.org/how-to-learn-python-from-scratch/"
web_content = scrape_website(website_url)

print(f"Scraped content length: {len(web_content)} characters")

Scraped content length: 58391 characters


In [7]:
# Preprocessing function
def preprocess_text(text):
    # Preserve code blocks (anything between triple backticks or indented blocks)
    code_pattern = r'```[\s\S]*?```|(?:^|\n)(?: {4,}|\t)[^\n]*(?:\n(?: {4,}|\t)[^\n]*)*'
    code_blocks = re.findall(code_pattern, text, re.MULTILINE)
    
    # Replace code blocks with placeholders
    for i, block in enumerate(code_blocks):
        text = text.replace(block, f"__CODE_BLOCK_{i}__")
    
    # Clean text (remove extra whitespace, normalize)
    text = re.sub(r'\s+', ' ', text)  # Multiple whitespace to single space
    text = re.sub(r'\n+', '\n', text)  # Multiple newlines to single
    text = text.strip()
    
    # Restore code blocks
    for i, block in enumerate(code_blocks):
        text = text.replace(f"__CODE_BLOCK_{i}__", block)
    
    return text

# Preprocess web content
if web_content:
    web_content = preprocess_text(web_content)
    print("Web content preprocessed successfully")
else:
    print("No web content to preprocess")

Web content preprocessed successfully


In [8]:
# Load PDF documents and combine with web content
pdf_folder_path = "../data/pdfs"
documents = []

# Load PDFs
if os.path.exists(pdf_folder_path):
    pdf_docs = SimpleDirectoryReader(pdf_folder_path).load_data()
    # Preprocess PDF content
    preprocessed_docs = []
    for doc in pdf_docs:
        new_doc = Document(
            text=preprocess_text(doc.text),
            metadata=doc.metadata
        )
        preprocessed_docs.append(new_doc)
    documents.extend(preprocessed_docs)
    print(f"Loaded and preprocessed {len(pdf_docs)} PDF documents")
else:
    print("PDF folder not found")

# Add web content as document
if web_content:
    web_doc = Document(
        text=web_content,
        metadata={"source": "web", "url": website_url}
    )
    documents.append(web_doc)
    print("Added web content as document")

print(f"Total documents: {len(documents)}")

Loaded and preprocessed 1310 PDF documents
Added web content as document
Total documents: 1311


In [9]:
# Display sample chunks after extraction and cleaning
def display_sample_chunks(documents, num_samples=3):
    print("=== SAMPLE DOCUMENT CHUNKS AFTER EXTRACTION AND CLEANING ===")
    print(f"Displaying {min(num_samples, len(documents))} sample chunks:\n")
    
    for i, doc in enumerate(documents[:num_samples]):
        print(f"--- Chunk {i+1} ---")
        print(f"Source: {doc.metadata.get('source', 'Unknown')}")
        if 'file_name' in doc.metadata:
            print(f"File: {doc.metadata['file_name']}")
        if 'url' in doc.metadata:
            print(f"URL: {doc.metadata['url']}")
        print(f"Text length: {len(doc.text)} characters")
        print(f"Text preview (first 500 chars):\n{doc.text[:500]}...")
        print("\n" + "="*80 + "\n")

# Display sample chunks
if documents:
    display_sample_chunks(documents)
else:
    print("No documents loaded to display")

=== SAMPLE DOCUMENT CHUNKS AFTER EXTRACTION AND CLEANING ===
Displaying 3 sample chunks:

--- Chunk 1 ---
Source: Unknown
File: Python Crash Course.pdf
Text length: 2046 characters
Text preview (first 500 chars):
A HANDS-ON , PROJECT-BASED INTRODUCTION TO PROGRAMMING ERIC MATTHES P Y THON C R ASH COURSE P Y THON C R ASH COURSE SHELVE IN: PROGRAMMING LANGUAGES/ PYTHON $39.95 ($45.95 CDN) FAST! LEARN PYTHON— FAST! LEARN PYTHON— PYTHON CRASH COURSEPYTHON CRASH COURSEMATTHES COVERS PYTHON 2 AND 3 Python Crash Course is a fast-paced, thorough intro- duction to programming with Python that will have you writing programs, solving problems, and making things that work in no time. In the first half of the book, y...


--- Chunk 2 ---
Source: Unknown
File: Python Crash Course.pdf
Text length: 19 characters
Text preview (first 500 chars):
Python Crash Course...


--- Chunk 3 ---
Source: Unknown
File: Python Crash Course.pdf
Text length: 0 characters
Text preview (first 500 chars):
...




In [10]:
# Setup ChromaDB for persistence
chroma_client = chromadb.PersistentClient(path="../data/chroma_db2")
chroma_collection = chroma_client.get_or_create_collection("python_docs")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

print("ChromaDB setup completed")
print(f"Collection count: {chroma_collection.count()}")

ChromaDB setup completed
Collection count: 18


In [11]:
# Create or load vector index with caching
if chroma_collection.count() == 0:
    # Create new index if empty
    vector_index = VectorStoreIndex.from_documents(
        documents, 
        storage_context=storage_context,
        show_progress=True
    )
    print("New vector index created and persisted")
else:
    # Load existing index
    vector_index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        storage_context=storage_context
    )
    print("Existing vector index loaded from cache")

print(f"Index contains {chroma_collection.count()} document chunks")

Existing vector index loaded from cache
Index contains 18 document chunks


In [12]:
# Display sample embeddings and their shape
def display_sample_embeddings(num_samples=2):
    print("=== SAMPLE EMBEDDINGS AND THEIR SHAPE ===")
    
    # Get sample texts from documents
    sample_texts = []
    for i, doc in enumerate(documents[:num_samples]):
        # Take first 200 characters as sample
        sample_text = doc.text[:200].strip()
        sample_texts.append(sample_text)
        print(f"Sample text {i+1}: {sample_text}...\n")
    
    if sample_texts:
        # Generate embeddings for sample texts
        embeddings = embed_model.get_text_embedding_batch(sample_texts)
        
        print(f"Number of embeddings generated: {len(embeddings)}")
        
        for i, embedding in enumerate(embeddings):
            embedding_array = np.array(embedding)
            print(f"\nEmbedding {i+1}:")
            print(f"  Shape: {embedding_array.shape}")
            print(f"  Type: {type(embedding_array)}")
            print(f"  Min value: {embedding_array.min():.4f}")
            print(f"  Max value: {embedding_array.max():.4f}")
            print(f"  Mean value: {embedding_array.mean():.4f}")
            print(f"  First 10 values: {embedding_array[:10]}")
            print(f"  Last 10 values: {embedding_array[-10:]}")
    else:
        print("No documents available for embedding demonstration")
    
    print("\n" + "="*80 + "\n")

# Display sample embeddings
if documents:
    display_sample_embeddings()
else:
    print("No documents available for embedding demonstration")

=== SAMPLE EMBEDDINGS AND THEIR SHAPE ===
Sample text 1: A HANDS-ON , PROJECT-BASED INTRODUCTION TO PROGRAMMING ERIC MATTHES P Y THON C R ASH COURSE P Y THON C R ASH COURSE SHELVE IN: PROGRAMMING LANGUAGES/ PYTHON $39.95 ($45.95 CDN) FAST! LEARN PYTHON— FAS...

Sample text 2: Python Crash Course...

Number of embeddings generated: 2

Embedding 1:
  Shape: (384,)
  Type: <class 'numpy.ndarray'>
  Min value: -0.1371
  Max value: 0.1886
  Mean value: -0.0010
  First 10 values: [-0.08057992  0.00041429 -0.01473134 -0.01707213 -0.06654104 -0.10962446
  0.05017243  0.04082484 -0.09665311  0.02742115]
  Last 10 values: [ 1.07379861e-01  5.47668785e-02 -9.17729437e-02 -5.07228906e-05
 -3.57930772e-02  4.36313786e-02  1.64995305e-02  6.50759786e-02
  1.06741174e-03  6.25112876e-02]

Embedding 2:
  Shape: (384,)
  Type: <class 'numpy.ndarray'>
  Min value: -0.1464
  Max value: 0.1609
  Mean value: 0.0008
  First 10 values: [ 0.02638219 -0.03466475 -0.00692113 -0.00566348 -0.00317804 -0.09809611


In [13]:
# Create query engine with default indexing
query_engine = vector_index.as_query_engine(
    similarity_top_k=3,  # Return top 3 most similar chunks
    response_mode="compact",  # Compact response format
)

print("Query engine created with default indexing")

Query engine created with default indexing


In [14]:
# Sample questions for the interface
sample_questions = [
    "What is a Python function?",
    "How do you define and call a void function?",
    "Explain Python data types",
    "What are Python loops?",
    "How to handle errors in Python?",
    "Explain Python classes and objects"
]

# Enhanced function to handle queries with context
def ask_question(question, use_context=True):
    if not question.strip():
        return "Please enter a question.", ""
    
    try:
        # Prepare query with context if enabled
        if use_context:
            context_string = context_handler.get_context_string()
            enhanced_question = f"{context_string}\nCurrent question: {question}"
        else:
            enhanced_question = question
        
        response = query_engine.query(enhanced_question)
        answer = str(response)
        
        # Add to conversation history
        if use_context:
            context_handler.add_to_history(question, answer)
        
        # Format context info
        context_info = f"Context enabled: {use_context}\n"
        context_info += f"History length: {len(context_handler.conversation_history)} items\n"
        if context_handler.conversation_history:
            context_info += f"Last question: {context_handler.conversation_history[-1]['question'][:50]}..."
        
        return answer, context_info
        
    except Exception as e:
        return f"Error processing question: {e}", f"Error occurred: {e}"

# Function for clearing context
def clear_context():
    context_handler.clear_history()
    return "Context cleared successfully!", "Context cleared - no conversation history"

print("Enhanced query functions defined with context handling")

Enhanced query functions defined with context handling


In [15]:
# Create Enhanced Gradio interface that runs internally
def create_interface():
    # Create sample questions HTML
    sample_html = "<h3>Try asking:</h3><ul>"
    for q in sample_questions:
        sample_html += f"<li>{q}</li>"
    sample_html += "</ul>"
    
    with gr.Blocks(title="Python Tutor", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# 🐍 Python Tutor")
        gr.Markdown("Ask me anything about Python programming! I remember our conversation context.")
        
        with gr.Row():
            with gr.Column(scale=2):
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="Enter your Python question here...",
                    lines=3
                )
                
                with gr.Row():
                    submit_btn = gr.Button("Ask Question", variant="primary", scale=2)
                    context_toggle = gr.Checkbox(
                        label="Use Context", 
                        value=True, 
                        info="Remember conversation history",
                        scale=1
                    )
                    clear_btn = gr.Button("Clear Context", variant="secondary", scale=1)
                
            with gr.Column(scale=1):
                gr.HTML(sample_html)
                
                context_info = gr.Textbox(
                    label="Context Info",
                    lines=4,
                    value="Context enabled: True\nHistory length: 0 items",
                    interactive=False
                )
        
        answer_output = gr.Textbox(
            label="Answer",
            lines=12,
            max_lines=25
        )
        
        # Event handlers
        submit_btn.click(
            fn=ask_question,
            inputs=[question_input, context_toggle],
            outputs=[answer_output, context_info]
        )
        
        question_input.submit(
            fn=ask_question,
            inputs=[question_input, context_toggle],
            outputs=[answer_output, context_info]
        )
        
        clear_btn.click(
            fn=clear_context,
            outputs=[answer_output, context_info]
        )
        
        # Example questions as buttons
        gr.Markdown("### Quick Examples:")
        with gr.Row():
            for i in range(0, len(sample_questions), 2):
                if i < len(sample_questions):
                    btn = gr.Button(sample_questions[i], size="sm")
                    btn.click(
                        lambda q=sample_questions[i]: q,
                        outputs=question_input
                    )
                if i+1 < len(sample_questions):
                    btn = gr.Button(sample_questions[i+1], size="sm")
                    btn.click(
                        lambda q=sample_questions[i+1]: q,
                        outputs=question_input
                    )
    
    return interface

# Create interface
app = create_interface()
print("Enhanced Gradio interface created with context handling")

Enhanced Gradio interface created with context handling


In [16]:
# Test the system with a sample question
test_question = "What is a Python function?"
print(f"Test Question: {test_question}")
test_response, test_context = ask_question(test_question)
print(f"\nTest Response: {test_response[:500]}...")
print(f"\nContext Info: {test_context}")

print("\n=== System Ready ===")
print("Run the next cell to launch the Gradio interface internally")

Test Question: What is a Python function?

Test Response: A Python function is a block of code that performs a specific task. Functions are used to organize code into reusable pieces, making it easier to read, write, and maintain. They can take inputs, known as arguments, and return outputs. Functions are defined using the `def` keyword followed by the function name and parentheses. Here's a simple example:

```python
def greet():
    print("Hello, Geeks!")

greet()  # This will print "Hello, Geeks!"
```

In this example, `greet` is a function that pri...

Context Info: Context enabled: True
History length: 1 items
Last question: What is a Python function?...

=== System Ready ===
Run the next cell to launch the Gradio interface internally


In [17]:
# Launch Gradio interface internally (runs inside the notebook)
app.launch(
    share=False,  # Keep it internal
    inbrowser=True,  # Open in browser tab
    inline=True,  # Display inline in notebook
    height=800,   # Set height for inline display
    quiet=True    # Reduce output noise
)



In [20]:
# Test Utility Function - Comprehensive testing of all steps
def run_comprehensive_test():
    """Test all components of the Python Learning Assistant system"""
    
    print("COMPREHENSIVE SYSTEM TEST")
    print("=" * 60)
    
    test_results = {
        'document_loading': False,
        'preprocessing': False,
        'embedding': False,
        'vector_store': False,
        'query_engine': False,
        'context_handling': False,
        'basic_query': False,
        'context_query': False
    }
    
    # Test 1: Document Loading
    print("\n1. Testing Document Loading...")
    try:
        if documents and len(documents) > 0:
            print(f"    Documents loaded: {len(documents)}")
            test_results['document_loading'] = True
        else:
            print("    No documents loaded")
    except Exception as e:
        print(f"    Document loading error: {e}")
    
    # Test 2: Preprocessing
    print("\n2. Testing Text Preprocessing...")
    try:
        test_text = "This is a   test\n\n\nwith    multiple spaces."
        processed = preprocess_text(test_text)
        if len(processed) < len(test_text):  # Should be cleaned
            print(f"    Preprocessing working (reduced from {len(test_text)} to {len(processed)} chars)")
            test_results['preprocessing'] = True
        else:
            print("     Preprocessing may not be working optimally")
    except Exception as e:
        print(f"    Preprocessing error: {e}")
    
    # Test 3: Embedding Generation
    print("\n3. Testing Embedding Generation...")
    try:
        test_text = "Python is a programming language"
        embedding = embed_model.get_text_embedding(test_text)
        embedding_array = np.array(embedding)
        print(f"    Embedding generated: shape {embedding_array.shape}")
        print(f"    Stats: min={embedding_array.min():.4f}, max={embedding_array.max():.4f}, mean={embedding_array.mean():.4f}")
        test_results['embedding'] = True
    except Exception as e:
        print(f"    Embedding error: {e}")
    
    # Test 4: Vector Store
    print("\n4. Testing Vector Store...")
    try:
        collection_count = chroma_collection.count()
        if collection_count > 0:
            print(f"    Vector store working: {collection_count} vectors stored")
            test_results['vector_store'] = True
        else:
            print("    Vector store empty")
    except Exception as e:
        print(f"    Vector store error: {e}")
    
    # Test 5: Query Engine
    print("\n5. Testing Query Engine...")
    try:
        if query_engine:
            print("    Query engine initialized")
            test_results['query_engine'] = True
        else:
            print("    Query engine not initialized")
    except Exception as e:
        print(f"    Query engine error: {e}")
    
    # Test 6: Context Handling
    print("\n6. Testing Context Handling...")
    try:
        # Clear context first
        context_handler.clear_history()
        
        # Add test conversation
        context_handler.add_to_history("What is Python?", "Python is a programming language")
        context_handler.add_to_history("What are variables?", "Variables store data values")
        
        context_string = context_handler.get_context_string()
        history_length = len(context_handler.conversation_history)
        
        if history_length == 2 and len(context_string) > 0:
            print(f"    Context handling working: {history_length} items in history")
            print(f"    Context string length: {len(context_string)} characters")
            test_results['context_handling'] = True
        else:
            print(f"    Context handling issue: {history_length} items, context length: {len(context_string)}")
    except Exception as e:
        print(f"    Context handling error: {e}")
    
    # Test 7: Basic Query (without context)
    print("\n7. Testing Basic Query (no context)...")
    try:
        context_handler.clear_history()  # Clear for clean test
        test_question = "What is a Python function?"
        response, context_info = ask_question(test_question, use_context=False)
        
        if response and "Error" not in response and len(response) > 50:
            print(f"    Basic query working: response length {len(response)} chars")
            print(f"    Sample response: {response[:100]}...")
            test_results['basic_query'] = True
        else:
            print(f"    Basic query failed: {response[:100]}...")
    except Exception as e:
        print(f"    Basic query error: {e}")
    
    # Test 8: Context-Aware Query
    print("\n8. Testing Context-Aware Query...")
    try:
        # First question to establish context
        response1, _ = ask_question("What are Python data types?", use_context=True)
        
        # Follow-up question that should use context
        response2, context_info = ask_question("Can you give me examples of these?", use_context=True)
        
        if response2 and "Error" not in response2 and len(context_handler.conversation_history) >= 2:
            print(f"    Context-aware query working: {len(context_handler.conversation_history)} items in history")
            print(f"    Follow-up response: {response2[:100]}...")
            print(f"    Context info: {context_info}")
            test_results['context_query'] = True
        else:
            print(f"    Context-aware query failed")
    except Exception as e:
        print(f"    Context-aware query error: {e}")
    
    # Test Summary
    print("\n" + "=" * 60)
    print(" TEST SUMMARY")
    print("=" * 60)
    
    passed_tests = sum(test_results.values())
    total_tests = len(test_results)
    
    for test_name, result in test_results.items():
        status = " PASS" if result else "❌ FAIL"
        print(f"{test_name.replace('_', ' ').title():<25} {status}")
    
    print(f"\nOverall Result: {passed_tests}/{total_tests} tests passed")
    
    if passed_tests == total_tests:
        print("🎉 ALL TESTS PASSED! System is fully functional.")
    elif passed_tests >= total_tests * 0.75:
        print("  Most tests passed. System is mostly functional with minor issues.")
    else:
        print(" Multiple test failures. System needs debugging.")
    
    # Performance metrics
    print("\n PERFORMANCE METRICS")
    print("=" * 40)
    
    try:
        # Embedding performance test
        import time
        test_texts = ["Python function", "Variable declaration", "Loop iteration"]
        start_time = time.time()
        embeddings = embed_model.get_text_embedding_batch(test_texts)
        embedding_time = time.time() - start_time
        print(f"Embedding speed: {len(test_texts)/embedding_time:.2f} texts/second")
        
        # Query performance test
        start_time = time.time()
        response, _ = ask_question("What is Python?", use_context=False)
        query_time = time.time() - start_time
        print(f"Query response time: {query_time:.2f} seconds")
        
        # Memory usage (approximate)
        if documents:
            total_chars = sum(len(doc.text) for doc in documents)
            print(f"Document corpus size: {total_chars:,} characters")
        
        print(f"Vector store size: {chroma_collection.count():,} vectors")
        
    except Exception as e:
        print(f"Performance metrics error: {e}")
    
    print("\n RECOMMENDATIONS")
    print("=" * 40)
    
    if not test_results['document_loading']:
        print("- Check PDF folder path and ensure documents are available")
    if not test_results['vector_store']:
        print("- Verify ChromaDB setup and document indexing")
    if not test_results['basic_query']:
        print("- Check LLM API key and connection")
    if not test_results['context_query']:
        print("- Verify context handling implementation")
    
    if passed_tests == total_tests:
        print("- System is optimally configured!")
        print("- Consider adding more documents to improve knowledge base")
        print("- Monitor query performance and adjust similarity_top_k if needed")
    
    return test_results

# Additional utility functions for testing individual components

def test_embedding_similarity():
    """Test embedding similarity between related texts"""
    print("\n EMBEDDING SIMILARITY TEST")
    print("=" * 40)
    
    test_pairs = [
        ("Python function", "Python method"),
        ("for loop", "while loop"),
        ("variable", "constant"),
        ("Python", "Java")  # Should be less similar
    ]
    
    for text1, text2 in test_pairs:
        try:
            emb1 = np.array(embed_model.get_text_embedding(text1))
            emb2 = np.array(embed_model.get_text_embedding(text2))
            
            # Calculate cosine similarity
            similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
            print(f"'{text1}' vs '{text2}': {similarity:.4f}")
            
        except Exception as e:
            print(f"Error calculating similarity for '{text1}' vs '{text2}': {e}")

def test_context_memory():
    """Test context memory limits and behavior"""
    print("\n CONTEXT MEMORY TEST")
    print("=" * 40)
    
    # Clear context
    context_handler.clear_history()
    
    # Add more items than max_history
    for i in range(context_handler.max_history + 3):
        context_handler.add_to_history(f"Question {i+1}", f"Answer {i+1}")
    
    print(f"Added {context_handler.max_history + 3} items")
    print(f"Context history length: {len(context_handler.conversation_history)}")
    print(f"Max history setting: {context_handler.max_history}")
    
    if len(context_handler.conversation_history) == context_handler.max_history:
        print(" Context memory limit working correctly")
    else:
        print(" Context memory limit not working as expected")
    
    # Show what's in memory
    print("\nItems in context memory:")
    for i, item in enumerate(context_handler.conversation_history):
        print(f"  {i+1}. Q: {item['question']}")

def run_quick_test():
    """Quick test of core functionality"""
    print(" QUICK SYSTEM TEST")
    print("=" * 30)
    
    try:
        # Test basic query
        response, _ = ask_question("What is Python?", use_context=False)
        if response and len(response) > 20:
            print(" Basic functionality working")
            return True
        else:
            print(" Basic functionality failed")
            return False
    except Exception as e:
        print(f" Quick test error: {e}")
        return False

print(" Test utility functions defined")
print("\nAvailable test functions:")
print("- run_comprehensive_test(): Complete system test")
print("- test_embedding_similarity(): Test embedding quality")
print("- test_context_memory(): Test context handling limits")
print("- run_quick_test(): Quick functionality check")
print("\nRun any of these functions to test the system!")

 Test utility functions defined

Available test functions:
- run_comprehensive_test(): Complete system test
- test_embedding_similarity(): Test embedding quality
- test_context_memory(): Test context handling limits
- run_quick_test(): Quick functionality check

Run any of these functions to test the system!


In [19]:
test_context_memory()


 CONTEXT MEMORY TEST
Added 8 items
Context history length: 5
Max history setting: 5
 Context memory limit working correctly

Items in context memory:
  1. Q: Question 4
  2. Q: Question 5
  3. Q: Question 6
  4. Q: Question 7
  5. Q: Question 8
