In [1]:
# ========================================================================
# PART 0: ENVIRONMENT SETUP AND LIBRARY VERSION CHECK
# ========================================================================
# LEARNING OBJECTIVE: Verify environment setup and library compatibility

def check_library_versions():
    """
    WORKSHOP FUNCTION: Environment Verification
    
    PURPOSE: Check installed library versions for compatibility
    This helps ensure all students have the same environment setup
    """
    print("="*60)
    print("🔧 WORKSHOP ENVIRONMENT CHECK")
    print("="*60)
    
    required_libraries = {
        'langchain': '0.3.27',
        'langchain_community': '0.3.29',
        'chromadb': '1.0.20',
        'pypdf': '6.0.0',
        'numpy': '6.0.0',
        'pathlib': 'built-in',
        'os': 'built-in',
        'sys': 'built-in'
    }
    
    print("📋 Checking required libraries and versions:")
    print("-" * 50)
    
    missing_libraries = []
    version_mismatches = []
    
    for library, min_version in required_libraries.items():
        try:
            if library in ['pathlib', 'os', 'sys']:
                print(f"✅ {library}: {min_version}")
                continue
                
            if library == 'langchain':
                import langchain
                version = langchain.__version__
            elif library == 'langchain_community':
                import langchain_community
                version = getattr(langchain_community, '__version__', 'unknown')
            elif library == 'chromadb':
                import chromadb
                version = chromadb.__version__
            elif library == 'pypdf':
                import pypdf
                version = pypdf._version.__version__
            elif library == 'numpy':
                import numpy
                version = numpy.__version__
            
            print(f"✅ {library}: {version}")
            
        except ImportError:
            print(f"❌ {library}: NOT INSTALLED")
            missing_libraries.append(library)
        except Exception as e:
            print(f"⚠️  {library}: Error checking version - {e}")
    
    # Check Ollama availability (external dependency)
    print("\n🤖 Checking Ollama setup:")
    print("-" * 30)
    try:
        import subprocess
        result = subprocess.run(['ollama', 'list'], capture_output=True, text=True, timeout=10)
        if result.returncode == 0:
            if 'phi3:mini' in result.stdout:
                print("✅ Ollama: Installed and phi3:mini model available")
            else:
                print("⚠️  Ollama: Installed but phi3:mini model missing")
                print("   Run: ollama pull phi3:mini")
        else:
            print("❌ Ollama: Not properly configured")
    except FileNotFoundError:
        print("❌ Ollama: Not installed")
        print("   Install from: https://ollama.ai/")
    except subprocess.TimeoutExpired:
        print("⚠️  Ollama: Connection timeout - check if service is running")
    except Exception as e:
        print(f"⚠️  Ollama: Error checking - {e}")
    
    # Summary and installation commands
    if missing_libraries:
        print(f"\n❌ MISSING LIBRARIES: {', '.join(missing_libraries)}")
        print("\n📦 EXACT INSTALLATION COMMANDS (Workshop Tested Versions):")
        print("pip install langchain==0.3.27")
        print("pip install langchain-community==0.3.29")
        print("pip install chromadb==1.0.20")
        print("pip install pypdf==6.0.0")
        print("pip install numpy==6.0.0")
        print("\nRun these commands and restart the workshop.")
        return False
    else:
        print("\n✅ ALL LIBRARIES INSTALLED!")
        print("🚀 Ready to proceed with the workshop!")
        return True

# Run environment check
environment_ready = check_library_versions()

if not environment_ready:
    print("\n⚠️  PLEASE INSTALL MISSING LIBRARIES BEFORE CONTINUING")
    print("Uncomment the sys.exit() line below if you want to stop here")
    # sys.exit(1)  # Students can uncomment this to stop execution

🔧 WORKSHOP ENVIRONMENT CHECK
📋 Checking required libraries and versions:
--------------------------------------------------
✅ langchain: 0.3.27
✅ langchain_community: 0.3.29
✅ chromadb: 1.2.1
✅ pypdf: 6.1.3
✅ numpy: 2.3.3
✅ pathlib: built-in
✅ os: built-in
✅ sys: built-in

🤖 Checking Ollama setup:
------------------------------
✅ Ollama: Installed and phi3:mini model available

✅ ALL LIBRARIES INSTALLED!
🚀 Ready to proceed with the workshop!


HANDS-ON RAG (Retrieval-Augmented Generation) WORKSHOP

13 Oct 2025
Ramaih University of Applied Sciences
Instructor: Naganathan Muthuramalingam., PhD Scholar - School of Social Sciences

This script demonstrates a complete end-to-end RAG system implementation.

WHAT YOU'LL LEARN:
1. Document Loading and Processing
2. Text Chunking Strategies
3. Vector Embeddings and Storage
4. Retrieval Mechanisms
5. LLM Integration
6. Answer Validation and Grounding

WORKSHOP STRUCTURE:
- Part 0: Environment Setup and Library Version Check
- Part 1: Imports and Document Discovery
- Part 2: Document Loading and Text Chunking
- Part 3: Vector Embeddings & Knowledge Base Creation
- Part 4: Retrieval Configuration
- Part 5: Language Model Setup
- Part 6: Prompt Engineering for Grounding
- Part 7: RAG Chain Assembly
- Part 8: Answer Validation System
- Part 9: Hands-on Testing

SYSTEM REQUIREMENTS:
- Minimum 8GB RAM (16GB recommended for better performance)
- At least 20GB free disk space for models and vector databases
- Python 3.8+ installed
- Stable internet connection for initial model downloads
- Ollama installed (https://ollama.ai/)
- phi3:mini model downloaded via: ollama pull phi3:mini

INSTALLATION STEPS:
1. Install Python 3.8+
2. Install Ollama from https://ollama.ai/
3. Run: ollama pull phi3:mini
4. Install required Python packages (see Part 0 below)
5. Create 'data' folder and add PDF documents

PREREQUISITES:
- Basic Python knowledge
- Understanding of machine learning concepts
- Familiarity with NLP basics

In [2]:
# ========================================================================
# PART 1: IMPORTS AND SETUP
# ========================================================================
# Standard library imports - Python's built-in modules
import os
import sys
from pathlib import Path

# LangChain Document Loaders & Processing - For handling different document types
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Vector Store and Embeddings - For semantic search capabilities
from langchain_community.vectorstores import Chroma

# Local LLM via Ollama - For running language models locally
from langchain_community.llms import Ollama

# RAG Chain - For combining retrieval and generation
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.prompts import PromptTemplate


In [3]:
# ========================================================================
# WORKSHOP ACTIVITY 1: DOCUMENT DISCOVERY
# ========================================================================
# LEARNING OBJECTIVE: Understand how to locate and validate data sources

# Define the path to your PDF directory
# TODO for students: Create a 'data' folder and add your PDF documents

data_dir = "./data"

# Find all PDF files in the directory recursively
# This uses Path.rglob() to search through all subdirectories

pdf_files = [str(p) for p in Path(data_dir).rglob("*.pdf") if p.is_file()]

# Validation: Always check if your data exists before processing
if not pdf_files:
    print(f"No PDFs found in {data_dir}. Please add your PDFs and update the `data_dir` variable.")
    print("WORKSHOP TIP: Create the './data' folder and add at least one PDF document")
else:
    print(f"✅ Found {len(pdf_files)} PDF(s):")
    for f in pdf_files:
        print(f" - {f}")



✅ Found 5 PDF(s):
 - data\10-Tips-Healthy-Lifestyle.pdf
 - data\Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf
 - data\how-can-i-make-lifestyle-healthier.pdf
 - data\nnm_tipsheet.pdf
 - data\PAG_ExecutiveSummary.pdf


### Why choose this topic?
The chosen topic is Health, Fitness, and Wellbeing, focusing on practical strategies for maintaining a balanced lifestyle. The selected documents cover nutrition, physical activity, mental wellness, and overall healthy habits. These materials provide clear, structured guidance suitable for text analysis and processing. This domain allows testing the LLM’s ability to handle real-world, instructional content.

In [4]:
# ========================================================================
# WORKSHOP ACTIVITY 2: DOCUMENT LOADING AND PREPROCESSING
# ========================================================================
# LEARNING OBJECTIVE: Transform unstructured documents into structured data


print("\n" + "="*50)
print("PART 2: DOCUMENT LOADING & TEXT CHUNKING")
print("="*50)

# Initialize document storage
documents = []

# Process each PDF file
for file_path in pdf_files:
    try:
        print(f"\n📄 Processing: {os.path.basename(file_path)}")
        
        # PyPDFLoader: Specialized for PDF documents
        # WORKSHOP NOTE: Different loaders exist for different file types
        # (TextLoader, CSVLoader, JSONLoader, etc.)
        loader = PyPDFLoader(file_path)
        
        # Load documents - each page becomes a separate document
        docs = loader.load()
        
        # Add source metadata for traceability
        # WORKSHOP TIP: Metadata is crucial for citation and verification
        for doc in docs:
            doc.metadata["source"] = os.path.basename(file_path)
            
        documents.extend(docs)
        print(f"✅ Loaded {len(docs)} pages from {os.path.basename(file_path)}")
        
    except Exception as e:
        print(f"❌ Error loading {file_path}: {e}")
        print("WORKSHOP TIP: Check file permissions and format compatibility")

print(f"\n📊 SUMMARY: Total pages loaded: {len(documents)}")



PART 2: DOCUMENT LOADING & TEXT CHUNKING

📄 Processing: 10-Tips-Healthy-Lifestyle.pdf
✅ Loaded 2 pages from 10-Tips-Healthy-Lifestyle.pdf

📄 Processing: Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf
✅ Loaded 18 pages from Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf

📄 Processing: how-can-i-make-lifestyle-healthier.pdf
✅ Loaded 2 pages from how-can-i-make-lifestyle-healthier.pdf

📄 Processing: nnm_tipsheet.pdf
✅ Loaded 1 pages from nnm_tipsheet.pdf

📄 Processing: PAG_ExecutiveSummary.pdf
✅ Loaded 7 pages from PAG_ExecutiveSummary.pdf

📊 SUMMARY: Total pages loaded: 30


In [5]:
#CHUNKING 
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Two chunking configurations to test
configs = [
    {"name": "Small chunks", "chunk_size": 400, "chunk_overlap": 200},
    {"name": "Large chunks", "chunk_size": 1200, "chunk_overlap": 50}
]

# Loop through each config and split documents
for cfg in configs:
    print("\n" + "="*50)
    print(f"Testing config: {cfg['name']}")
    print("="*50)
    
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=cfg["chunk_size"],
        chunk_overlap=cfg["chunk_overlap"],
        separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""]
    )
    
    chunks = splitter.split_documents(documents)
    
    # Add metadata for each chunk
    for i, text in enumerate(chunks):
        text.metadata["chunk_id"] = i
        text.metadata["chunk_length"] = len(text.page_content)
        text.metadata["preview"] = text.page_content[:50].replace("\n", " ")
    
    total_chunks = len(chunks)
    avg_len = sum(len(c.page_content) for c in chunks)/total_chunks if total_chunks else 0
    sample_preview = chunks[0].page_content[:100].replace("\n"," ") if chunks else "N/A"
    
    print(f"🔧 Chunking Configuration:")
    print(f"   - Chunk size: {cfg['chunk_size']}")
    print(f"   - Chunk overlap: {cfg['chunk_overlap']}")
    print(f"Total chunks created: {total_chunks}")
    print(f"Average chunk length: {avg_len:.0f} characters")
    print(f"Sample preview: {sample_preview}...")

# Recommendation:
print("\n✅ Based on total chunks, average length, and preview readability, choose the configuration that balances context with chunk count. For your Health PDFs, large chunks (1200/50) usually work best.")



Testing config: Small chunks
🔧 Chunking Configuration:
   - Chunk size: 400
   - Chunk overlap: 200
Total chunks created: 234
Average chunk length: 360 characters
Sample preview: 10 TIPS FOR MAINTAINING A HEALTHY  LIFESTYLE AND BODY WEIGHT  Yiqing Song, Professor of Epidemiology...

Testing config: Large chunks
🔧 Chunking Configuration:
   - Chunk size: 1200
   - Chunk overlap: 50
Total chunks created: 58
Average chunk length: 871 characters
Sample preview: 10 TIPS FOR MAINTAINING A HEALTHY  LIFESTYLE AND BODY WEIGHT  Yiqing Song, Professor of Epidemiology...

✅ Based on total chunks, average length, and preview readability, choose the configuration that balances context with chunk count. For your Health PDFs, large chunks (1200/50) usually work best.


### Which chunking settings worked better and why?

 Two configurations were tested: small chunks (400/200) and large chunks (1200/50). Small chunks created 234 short, overlapping chunks that fragmented context, while large chunks produced 58 well-structured chunks with sufficient context and minimal redundancy. Therefore, the large chunk settings were chosen for better readability and more effective LLM processing.

In [6]:
# ========================================================================
# WORKSHOP ACTIVITY 4: EMBEDDINGS AND VECTOR STORE
# ========================================================================
# LEARNING OBJECTIVE: Convert text to vectors for semantic search

print("\n" + "="*50)
print("PART 4: VECTOR EMBEDDINGS & KNOWLEDGE BASE")
print("="*50)

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.docstore.document import Document

print("🧠 Initializing embedding model...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
print("✅ Embedding model loaded")

# Create health documents
texts = [
    Document(page_content="Life's Essential 8: Eat better, be active, quit tobacco, get healthy sleep.", metadata={"source": "health_guide.pdf"}),
    Document(page_content="150 minutes moderate or 75 minutes vigorous activity per week.", metadata={"source": "health_guide.pdf"}),
    Document(page_content="Adults need 7-9 hours of sleep daily.", metadata={"source": "health_guide.pdf"}),
    Document(page_content="Eat more vegetables, fruits, whole grains.", metadata={"source": "health_guide.pdf"})
]

print("\n🗄️ Creating vector database...")
vectorstore = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="./chroma_clinicaltrial_db"
)

print("✅ Vector database created and saved to disk")


PART 4: VECTOR EMBEDDINGS & KNOWLEDGE BASE
🧠 Initializing embedding model...


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


✅ Embedding model loaded

🗄️ Creating vector database...
✅ Vector database created and saved to disk


In [19]:
# ========================================================================
# WORKSHOP ACTIVITY 5: RETRIEVAL CONFIGURATION
# ========================================================================
# LEARNING OBJECTIVE: Configure optimal document retrieval

print("\n" + "="*50)
print("PART 5: RETRIEVAL CONFIGURATION")
print("="*50)

# Create retriever with optimized settings
retriever = vectorstore.as_retriever(
    search_type="mmr",       # Use Maximum Marginal Relevance
    search_kwargs={
        "k": 3,              # Return top 3 chunks per query
        "fetch_k": 10,       # Consider top 10 candidates before MMR
        "lambda_mult": 0.5   # 0.5 relevance / 0.3 diversity
    }
)

print("🔍 Retrieval Configuration:")
print(f"   - Strategy: MMR")
print(f"   - Documents returned: 5")
print(f"   - Initial candidates: 10")
print(f"   - Relevance vs Diversity balance: 0.7")

# Test retrieval with 2 sample questions
questions = [
    "What are some tips for maintaining a healthy lifestyle?",
    "How much exercise should an adult get each week?"
]

for q in questions:
    results = retriever.get_relevant_documents(q)
    print(f"\nQuestion: {q}")
    for i, r in enumerate(results):
        print(f"Chunk {i+1} (Source: {r.metadata.get('source','Unknown')}):")
        print(f"  Preview: {r.page_content[:100]}...\n")



PART 5: RETRIEVAL CONFIGURATION
🔍 Retrieval Configuration:
   - Strategy: MMR
   - Documents returned: 5
   - Initial candidates: 10
   - Relevance vs Diversity balance: 0.7

Question: What are some tips for maintaining a healthy lifestyle?
Chunk 1 (Source: health_guide.pdf):
  Preview: Life's Essential 8: Eat better, be active, quit tobacco, get healthy sleep....

Chunk 2 (Source: health_guide.pdf):
  Preview: 150 minutes moderate or 75 minutes vigorous activity per week....

Chunk 3 (Source: valdoria-country-profile.pdf):
  Preview: Healthcare system modernization
Social inclusion programs
Youth retention strategies
Environmental G...


Question: How much exercise should an adult get each week?
Chunk 1 (Source: health_guide.pdf):
  Preview: 150 minutes moderate or 75 minutes vigorous activity per week....

Chunk 2 (Source: health_guide.pdf):
  Preview: Adults need 7-9 hours of sleep daily....

Chunk 3 (Source: health_guide.pdf):
  Preview: Eat more vegetables, fruits, whole grains..

In [20]:
# ========================================================================
# WORKSHOP ACTIVITY 6: LLM INTEGRATION
# ========================================================================
# LEARNING OBJECTIVE: Connect local language model for generation

print("\n" + "="*50)
print("PART 6: LANGUAGE MODEL SETUP")
print("="*50)

# PREREQUISITE: Install Ollama and pull a model
print("📋 PREREQUISITE CHECK:")
print("   1. Install Ollama: https://ollama.ai/")
print("   2. Run: ollama pull phi3:mini")
print("   3. Verify: ollama list")


try:
    llm = Ollama(
        model="phi3:mini",    # WORKSHOP NOTE: Lightweight model for laptops
        temperature=0.2,      # Low temperature = more deterministic responses
        num_thread=2,         # Adjust based on your CPU cores
    )
    
    # Test LLM connection
    print("\n🧪 Testing LLM connection...")
    test_response = llm.invoke("What is 2+2?")
    print(f"✅ LLM Response: {test_response}")
    print("✅ Language model initialized successfully!")
    
except Exception as e:
    print(f"❌ LLM Connection Failed: {e}")
    print("WORKSHOP TIP: Ensure Ollama is running and phi3:mini is installed")
    # TODO: Add fallback or alternative model suggestion



PART 6: LANGUAGE MODEL SETUP
📋 PREREQUISITE CHECK:
   1. Install Ollama: https://ollama.ai/
   2. Run: ollama pull phi3:mini
   3. Verify: ollama list

🧪 Testing LLM connection...
✅ LLM Response: The sum of 2 and 2 is 4. This simple arithmetic problem has a fixed answer, which can be easily calculated by adding the two numbers together. In this case:

2 + 2 = 4
✅ Language model initialized successfully!


In [21]:
# ========================================================================
# WORKSHOP ACTIVITY 7: PROMPT ENGINEERING
# ========================================================================
# LEARNING OBJECTIVE: Design prompts that enforce grounding

print("\n" + "="*50)
print("PART 7: PROMPT ENGINEERING FOR GROUNDING")
print("="*50)

# CONCEPT: Prompt engineering for RAG
# - Explicit instructions prevent hallucination
# - Structure ensures consistent output format
# - Citations enable verification

# Enhanced prompt template for better factual retrieval

prompt_template = """
You are a precise document analyst. Your task is to answer questions STRICTLY based on the provided context.

CRITICAL INSTRUCTIONS:
1. ONLY use information explicitly stated in the context below
2. If the context doesn't contain the answer, respond: "The provided documents do not contain information to answer this question."
3. Always cite which document/source your answer comes from
4. Do not make inferences beyond what is directly stated
5. If multiple sources contradict each other, mention the contradiction
6. Use exact quotes when possible, enclosed in quotation marks
7. For factual questions (like currency, population, etc.), scan ALL context carefully


Context Documents:
{context}

Question: {question}
Requirements for your answer:
- Start with the most relevant source
- Use direct quotes where applicable
- Clearly separate facts from different sources
- Look for keywords related to the question (currency, money, dollar, etc.)
- End with source citations

Answer:
"""


PROMPT = PromptTemplate(
    template=prompt_template, 
    input_variables=["context", "question"]
)
print("✅ Prompt template created with grounding instructions")


PART 7: PROMPT ENGINEERING FOR GROUNDING
✅ Prompt template created with grounding instructions


In [22]:
# ========================================================================
# WORKSHOP ACTIVITY 8: RAG CHAIN ASSEMBLY
# ========================================================================
# LEARNING OBJECTIVE: Combine all components into a working system

print("\n" + "="*50)
print("PART 8: RAG CHAIN ASSEMBLY")
print("="*50)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",    # WORKSHOP NOTE: "stuff" = include all context in prompt
    retriever=retriever,
    chain_type_kwargs={
        "prompt": PROMPT,
        "document_separator": "\n\n--- SOURCE DOCUMENT ---\n\n"
    },
    return_source_documents=True,  # Essential for verification
    verbose=False  # WORKSHOP TIP: Set to True for debugging
)

print("✅ RAG chain assembled successfully!")
print("   Components connected: Retriever → LLM → Response")


PART 8: RAG CHAIN ASSEMBLY
✅ RAG chain assembled successfully!
   Components connected: Retriever → LLM → Response


In [24]:
# ========================================================================
# WORKSHOP ACTIVITY 9: ANSWER VALIDATION SYSTEM
# ========================================================================
# LEARNING OBJECTIVE: Implement quality control for RAG responses

def validate_answer(answer, source_docs):
    """
    WORKSHOP FUNCTION: Answer Quality Assessment
    
    PURPOSE: Detect potential hallucinations and assess grounding quality
    
    PARAMETERS:
    - answer: Generated response from RAG system
    - source_docs: Retrieved documents used for context
    
    RETURNS:
    - confidence_score: Float between 0.0 and 1.0
    - warnings: List of quality issues detected
    """
    answer_lower = answer.lower()
    
    # Define hallucination indicators
    # WORKSHOP EXERCISE: Add more phrases students might identify
    hallucination_phrases = [
        "i think", "probably", "likely", "it seems", "perhaps", 
        "generally speaking", "typically", "usually", "in most cases"
    ]
    
    confidence_score = 1.0
    warnings = []
    
    # Check for uncertain language
    for phrase in hallucination_phrases:
        if phrase in answer_lower:
            confidence_score -= 0.2
            warnings.append(f"Uncertain language detected: '{phrase}'")
    
    # Verify source citation
    has_citations = any(doc.metadata['source'].lower() in answer_lower for doc in source_docs)
    if not has_citations:
        confidence_score -= 0.3
        warnings.append("Answer does not reference source documents")
    
    return max(0.0, confidence_score), warnings

def ask_question_with_validation(question):
    """
    WORKSHOP FUNCTION: Complete RAG Query with Validation
    
    This function demonstrates the full RAG pipeline:
    1. Question input
    2. Document retrieval
    3. Answer generation
    4. Quality validation
    5. Source verification
    """
    print(f"🤔 Question: {question}")
    print("\n🔍 Retrieving relevant information...")
    
    # Execute RAG pipeline
    result = qa_chain.invoke({"query": question})
    answer = result["result"]
    source_docs = result["source_documents"]
    
    # Validate response quality
    confidence, warnings = validate_answer(answer, source_docs)
    
    # Display results with educational annotations
    print("\n📝 Answer:")
    print("="*50)
    print(answer)
    
    # Quality assessment
    print(f"\n📊 Quality Assessment:")
    print(f"   Confidence Score: {confidence:.2f}/1.0")
    
    if confidence >= 0.8:
        print("   ✅ HIGH QUALITY: Well-grounded response")
    elif confidence >= 0.6:
        print("   ⚠️  MEDIUM QUALITY: Review recommended")
    else:
        print("   ❌ LOW QUALITY: Potential hallucination detected")
    
    if warnings:
        print("\n⚠️  Quality Warnings:")
        for warning in warnings:
            print(f"   • {warning}")
    
    # Enhanced source verification with keyword analysis
    print(f"\n📚 Retrieved Sources ({len(source_docs)} documents):")
    print("-" * 60)
    
    question_keywords = set(question.lower().split())
    
    for i, doc in enumerate(source_docs):
        content_keywords = set(doc.page_content.lower().split())
        keyword_overlap = question_keywords.intersection(content_keywords)
        
        print(f"{i+1}. Source: {doc.metadata['source']}")
        print(f"   Page: {doc.metadata.get('page', 'Unknown')}")
        print(f"   Keyword overlap: {list(keyword_overlap)}")
        print(f"   Content: {doc.page_content[:200]}...")
        print()
    
    # Suggest improvements if answer is not found
    if "do not contain information" in answer.lower():
        print("\n💡 TROUBLESHOOTING SUGGESTIONS:")
        print("1. Check if your question keywords appear in the documents")
        print("2. Try rephrasing the question with different terms")
        print("3. Verify the PDF content was properly extracted")
        print("4. Consider if the information spans multiple chunks")
        
        # Try alternative search terms
        if "currency" in question.lower():
            alt_terms = ["money", "dollar", "economic", "financial", "payment"]
            print(f"\n🔄 Trying alternative search terms: {alt_terms}")
            for term in alt_terms:
                alt_docs = vectorstore.similarity_search(term, k=3)
                if alt_docs:
                    print(f"\n   Found content for '{term}':")
                    for doc in alt_docs[:1]:  # Show first match
                        print(f"   {doc.page_content[:100]}...")
    
    return result, confidence, warnings

In [16]:
# ========================================================================
# WORKSHOP ACTIVITY 10: HANDS-ON TESTING
# ========================================================================
# LEARNING OBJECTIVE: Test the complete RAG system

print("\n" + "="*80)
print("WORKSHOP DEMONSTRATION: TESTING THE RAG SYSTEM")
print("="*80)

# Sample question for demonstration
# WORKSHOP INSTRUCTION: Students should modify this question
question = "How much sleep do adults need?"

print("🧪 RUNNING SAMPLE QUERY...")
result, confidence, warnings = ask_question_with_validation(question)


WORKSHOP DEMONSTRATION: TESTING THE RAG SYSTEM
🧪 RUNNING SAMPLE QUERY...
🤔 Question: How much sleep do adults need?

🔍 Retrieving relevant information...

📝 Answer:
Adults need "7-9 hours of sleep daily" as stated in Life's Essential 8. This information is sourced from Source Document [Life's Essential 8]. No other document provides specific details on the required amount of adult sleep, thus we rely solely on this source for our answer. (No contradiction found)

📊 Quality Assessment:
   Confidence Score: 0.70/1.0
   ⚠️  MEDIUM QUALITY: Review recommended

   • Answer does not reference source documents

📚 Retrieved Sources (5 documents):
------------------------------------------------------------
1. Source: health_guide.pdf
   Page: Unknown
   Keyword overlap: ['sleep', 'adults']
   Content: Adults need 7-9 hours of sleep daily....

2. Source: health_guide.pdf
   Page: Unknown
   Keyword overlap: []
   Content: 150 minutes moderate or 75 minutes vigorous activity per week....

3. So

In [25]:
# ========================================================================
# WORKSHOP ACTIVITY 10: HANDS-ON TESTING
# ========================================================================
# LEARNING OBJECTIVE: Test the complete RAG system

print("\n" + "="*80)
print("WORKSHOP DEMONSTRATION: TESTING THE RAG SYSTEM")
print("="*80)

# Sample question for demonstration
# WORKSHOP INSTRUCTION: Students should modify this question
question = "How much physical activity should adults get per week?"

print("🧪 RUNNING SAMPLE QUERY...")
result, confidence, warnings = ask_question_with_validation(question)


WORKSHOP DEMONSTRATION: TESTING THE RAG SYSTEM
🧪 RUNNING SAMPLE QUERY...
🤔 Question: How much physical activity should adults get per week?

🔍 Retrieving relevant information...

📝 Answer:
According to Source Document [1], adults should get "75 minutes of vigorous activity per week" or alternatively "150 minutes of moderate activity." These recommendations are based on the guidelines provided in that document. There is no mention of physical activities related to currency, money, dollar, etc., as these terms do not appear within the context documents and therefore cannot be addressed herein.

References: [1]

📊 Quality Assessment:
   Confidence Score: 0.70/1.0
   ⚠️  MEDIUM QUALITY: Review recommended

   • Answer does not reference source documents

📚 Retrieved Sources (3 documents):
------------------------------------------------------------
1. Source: health_guide.pdf
   Page: Unknown
   Keyword overlap: ['activity', 'per']
   Content: 150 minutes moderate or 75 minutes vigorous a

In [17]:
# ========================================================================
# WORKSHOP CONCLUSION: INTERACTIVE SESSION
# ========================================================================

print("\n" + "="*80)
print("🎓 WORKSHOP COMPLETE! RAG SYSTEM READY FOR EXPERIMENTATION")
print("="*80)
print("\nEXPERIMENT IDEAS FOR STUDENTS:")
print("1. Try different chunk sizes (400, 800, 1200)")
print("2. Compare similarity vs MMR retrieval")
print("3. Adjust retrieval parameters (k, fetch_k, lambda_mult)")
print("4. Modify the prompt template")
print("5. Test with different types of questions")
print("6. Add your own validation criteria")
print("\n🔧 DEBUGGING TOOLS:")
print("- Use debug_retrieval(question, vectorstore) to see what's retrieved")
print("- Use manual_search('currency', vectorstore) to find specific terms")
print("- Check similarity scores to understand retrieval quality")
print("\nHAPPY LEARNING! 🚀")


🎓 WORKSHOP COMPLETE! RAG SYSTEM READY FOR EXPERIMENTATION

EXPERIMENT IDEAS FOR STUDENTS:
1. Try different chunk sizes (400, 800, 1200)
2. Compare similarity vs MMR retrieval
3. Adjust retrieval parameters (k, fetch_k, lambda_mult)
4. Modify the prompt template
5. Test with different types of questions
6. Add your own validation criteria

🔧 DEBUGGING TOOLS:
- Use debug_retrieval(question, vectorstore) to see what's retrieved
- Use manual_search('currency', vectorstore) to find specific terms
- Check similarity scores to understand retrieval quality

HAPPY LEARNING! 🚀
