**Notebook 03 was used iteratively for testing the RAG pipeline before production**

- ‚úÖ LLM client configured
- ‚úÖ Prompt templates defined  
- ‚úÖ Answer generator ready

# LLM Response Generation

**Why we're doing this:**
Load vector index and generate coherent answers using a language model.

**What we're doing:**

- Import RAG components
- Set up the LLM client (Groq/Llama)
- Create and test prompt template with a hybrid search function specified for our user queries
- Generating answers from retrieved context

In [19]:
# Import libraries
import os
from groq import Groq
from dotenv import load_dotenv
import os
import importlib
import sys
import json
from datetime import datetime


# Load environment variables
load_dotenv()

True

In [20]:
# Import RAG components

def get_correct_paths():
    """Get absolute paths - hardcoded based on your folder structure"""
    current_dir = os.getcwd()
    print(f"Current directory: {current_dir}")
    
    # Based on your screenshot, your structure is:
    # INNOVATION-1.../03_notebooks/rag_notebooks/rag_components
    
    # Try different approaches to find rag_components
    possible_paths = [
        # If notebook is in rag_notebooks folder
        os.path.join(current_dir, 'rag_components'),
        # If notebook is in a subfolder
        os.path.join(os.path.dirname(current_dir), 'rag_components'),
        # If we need to go up a level
        os.path.join(os.path.dirname(os.path.dirname(current_dir)), 'rag_components'),
    ]
    
    rag_components_path = None
    for path in possible_paths:
        if os.path.exists(path) and os.path.isdir(path):
            rag_components_path = path
            break
    
    if not rag_components_path:
        # Last resort: create relative path
        rag_components_path = os.path.join(current_dir, 'rag_components')
    
    # Find project root by looking for 04_models
    project_root = current_dir
    for i in range(3):  # Check up to 3 levels up
        if os.path.exists(os.path.join(project_root, '04_models')):
            break
        project_root = os.path.dirname(project_root)
    
    vector_index_path = os.path.join(project_root, '04_models', 'vector_index')
    
    return rag_components_path, vector_index_path, project_root

def import_your_components():
    """Import all RAG components"""
    rag_components_path, _, _ = get_correct_paths()
    
    # Check if rag_components directory exists
    if not os.path.exists(rag_components_path):
        return None, f"RAG components directory not found at: {rag_components_path}"
    
    try:
        # Add rag_components to sys.path if not already there
        if rag_components_path not in sys.path:
            sys.path.insert(0, rag_components_path)
        
        # 1. Import FAISS retriever
        retriever_path = os.path.join(rag_components_path, 'faiss_retriever.py')
        if not os.path.exists(retriever_path):
            return None, f"FAISS retriever not found at: {retriever_path}"
        
        spec = importlib.util.spec_from_file_location("faiss_retriever", retriever_path)
        retriever_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(retriever_module)
        
        # 2. Import query_interface  
        query_interface_path = os.path.join(rag_components_path, 'query_interface.py')
        if not os.path.exists(query_interface_path):
            return None, f"Query interface not found at: {query_interface_path}"
        
        spec = importlib.util.spec_from_file_location("query_interface", query_interface_path)
        query_interface_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(query_interface_module)
        
        # 3. Import answer_generator
        answer_generator_path = os.path.join(rag_components_path, 'answer_generator.py')
        if not os.path.exists(answer_generator_path):
            return None, f"Answer generator not found at: {answer_generator_path}"
        
        spec = importlib.util.spec_from_file_location("answer_generator", answer_generator_path)
        answer_generator_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(answer_generator_module)
        
        # Return the actual classes, not the modules
        return (
            retriever_module.FAISSRetriever,  # This is the class name in faiss_retriever.py
            query_interface_module.SimpleQueryInterface if hasattr(query_interface_module, 'SimpleQueryInterface') else None,
            answer_generator_module.RAGAnswerGenerator if hasattr(answer_generator_module, 'RAGAnswerGenerator') else None
        ), None
        
    except Exception as e:
        return None, f"Error importing RAG components: {str(e)}"


# Usage in your notebook:
# Import the components
components_result = import_your_components()
if isinstance(components_result, tuple) and len(components_result) == 2:
    components, error = components_result
    if error:
        print(f"‚ùå Error: {error}")
    else:
        FAISSRetrieverClass, SimpleQueryInterfaceClass, RAGAnswerGeneratorClass = components
        print("üéâ ALL COMPONENTS IMPORTED SUCCESSFULLY!")
        
        # Get paths for initialization
        _, vector_index_path, _ = get_correct_paths()
        
        # Initialize components
        retriever = FAISSRetrieverClass(vector_index_path)
        query_interface = SimpleQueryInterfaceClass(retriever) if SimpleQueryInterfaceClass else None
        answer_generator = RAGAnswerGeneratorClass(query_interface) if RAGAnswerGeneratorClass else None
        
        print("‚úÖ Generation pipeline ready!")
else:
    print("‚ùå Unexpected return value from import_your_components()")

Current directory: /Users/siriamandaraaf/Documents/spiced-academy/04-course-material/spiced-academy-data-science/capstone-project/innovation-intelligence-suite/03_notebooks/rag_notebooks
üéâ ALL COMPONENTS IMPORTED SUCCESSFULLY!
Current directory: /Users/siriamandaraaf/Documents/spiced-academy/04-course-material/spiced-academy-data-science/capstone-project/innovation-intelligence-suite/03_notebooks/rag_notebooks
üîç Loading FAISS index from /Users/siriamandaraaf/Documents/spiced-academy/04-course-material/spiced-academy-data-science/capstone-project/innovation-intelligence-suite/04_models/vector_index
‚úì FAISS index loaded: 18717 vectors
‚úì Texts loaded: 18717 chunks
‚úì Metadata loaded: 18717 entries
‚úì Embedding model loaded: sentence-transformers/all-MiniLM-L6-v2
‚úì Template-based RAG answer generator initialized
‚úÖ Generation pipeline ready!


In [None]:
# Initialize Groq client

def setup_groq_client():
    """Client setup"""
    try:
        from groq import Groq
        from dotenv import load_dotenv
        load_dotenv()
        
        api_key = os.getenv('GROQ_API_KEY')
        if not api_key:
            return None, "GROQ_API_KEY not found in environment variables"
        
        client = Groq(api_key=api_key)
        return client, None
    except ImportError:
        return None, "Groq package not installed. Run: pip install groq"
    except Exception as e:
        return None, f"Error setting up Groq client: {str(e)}"

In [25]:
groq_client, error = setup_groq_client()
if error:
    print(f"‚ùå Failed to setup Groq client: {error}")
    print("Please ensure:")
    print("1. You have installed groq: pip install groq")
    print("2. You have a .env file with GROQ_API_KEY=your_key_here")
    print("3. You've restarted your notebook kernel after installing packages")
else:
    print("‚úÖ Groq client created successfully")

‚úÖ Groq client created successfully


In [None]:
# Test LLM Connection
# Why: Verify Groq API works and model responds correctly
# What: Send simple test query to confirm setup is functional
def test_llm_connection():
    try:
        response = groq_client.chat.completions.create(
            model="llama-3.1-8b-instant",  # Fast, free model for testing
            messages=[{"role": "user", "content": "Reply only with 'API connected'"}],
            max_tokens=10,
            temperature=0.1
        )
        print(f"‚úÖ LLM Connected: {response.choices[0].message.content}")
        return True
    except Exception as e:
        print(f"‚ùå LLM Failed: {e}")
        return False

test_llm_connection()

‚úÖ LLM Connected: API connected


True

In [None]:
# Integrate with Your Generator

def generate_with_llm(query, context):
    """Generate answer using Groq/Llama"""
    prompt = f"""
    Based on the following context, answer the user's question.
    
    Context: {context}
    
    Question: {query}
    
    Answer:
    """
    
    response = groq_client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=500,
        temperature=0.3
    )
    
    return response.choices[0].message.content

print("üöÄ LLM integration code ready!")

üöÄ LLM integration code ready!


In [None]:
# Prompt template

UNIVERSAL_PROMPT_TEMPLATE = """
CONTEXT:
{context}

USER QUESTION:
{question}

ANALYSIS INSTRUCTIONS:
1. Provide a comprehensive answer based strictly on the context provided
2. Cite specific sources for each key point using [Source: filename]
3. If the context is insufficient, acknowledge what cannot be answered

{trl_section}
{patent_section}
{startup_section}

ADDITIONAL GUIDELINES:
- For technology maturity questions: assess development stage and transition evidence
- For patent questions: consider jurisdiction and document type implications
- For trend questions: identify velocity, drivers, and key players  
- For forecasting: distinguish near-term vs long-term developments
- For descriptive questions: provide specific examples and entities

ANSWER:
"""

def build_smart_prompt(question, context):
    """Prompt template with specific guidance for all query types"""
    question_lower = question.lower()
    
    # Detect query type for targeted guidance
    is_startup_question = any(keyword in question_lower for keyword in 
                            ['startup', 'company', 'companies', 'venture', 'business', 'funding'])
    
    is_patent_question = any(keyword in question_lower for keyword in 
                           ['patent', 'intellectual property', 'ip', 'jurisdiction', 'ep', 'us', 'wo'])
    
    is_research_question = any(keyword in question_lower for keyword in 
                             ['research', 'study', 'paper', 'academic', 'scientific', 'methodology'])
    
    is_trend_question = any(keyword in question_lower for keyword in 
                          ['trend', 'forecast', 'future', 'emerging', 'development', 'innovation', 'pain point', 'challenge'])
    
    is_maturity_question = any(keyword in question_lower for keyword in 
                             ['trl', 'maturity', 'readiness', 'commercial', 'transition', 'stage'])
    
    is_technology_question = any(keyword in question_lower for keyword in 
                               ['technology', 'tech', 'system', 'solution', 'application', 'deployment', 'agent', 'agents'])
    
    # Build targeted guidance sections
    guidance_sections = []
    
    # Patents
    if is_patent_question:
        guidance_sections.append("""
üîç **PATENT QUERY GUIDANCE:**
1. **EXTRACT PATENT DETAILS**: Patent numbers, titles, inventors, assignees, jurisdictions
2. **ANALYZE JURISDICTIONS**: 
   - EP: European Patent Office (covers multiple countries)
   - US: United States Patent and Trademark Office
   - WO: World Intellectual Property Organization (international applications)
3. **IDENTIFY TECHNOLOGIES**: Specific automotive/AI technologies protected
4. **NOTE KEY DATES**: Filing dates, publication dates, grant dates when available
5. **ORGANIZE BY TYPE**: Group by jurisdiction or technology area
6. **SOURCE SPECIFICALLY**: Always cite patent database sources [Source: Automotive Technology Patents Database]
""")
    
    # Startups
    if is_startup_question:
        guidance_sections.append("""
üöÄ **STARTUP QUERY GUIDANCE:**
1. **EXTRACT COMPANY NAMES**: All startup/company names mentioned
2. **INCLUDE DETAILS**: Location, founding year, funding stage, key technologies
3. **FOCUS ON DATABASES**: Prioritize information from startup-specific sources
4. **ORGANIZE CLEARLY**: Create numbered lists with consistent formatting
5. **HIGHLIGHT AI FOCUS**: Note AI applications in automotive context
6. **CITE PROPERLY**: Always include source names
""")
    
    # Research
    if is_research_question:
        guidance_sections.append("""
üìö **RESEARCH QUERY GUIDANCE:**
1. **EXTRACT KEY FINDINGS**: Main conclusions, methodologies, results
2. **IDENTIFY AUTHORS & INSTITUTIONS**: Research teams and affiliations
3. **NOTE TECHNICAL DETAILS**: Specific algorithms, models, datasets used
4. **ASSESS NOVELTY**: Unique contributions or innovations mentioned
5. **CONNECT TO APPLICATIONS**: Practical automotive applications discussed
6. **ORGANIZE BY THEME**: Group related research findings together
""")
    
    # Trends
    if is_trend_question:
        guidance_sections.append("""
üìà **TREND/CHALLENGE GUIDANCE:**
1. **IDENTIFY KEY TRENDS/PAIN POINTS**: Major developments, challenges, or patterns
2. **EXTRACT VELOCITY INDICATORS**: Growth rates, adoption curves, investment trends
3. **NOTE DRIVERS & BARRIERS**: Factors enabling or hindering adoption
4. **HIGHLIGHT KEY PLAYERS**: Companies, institutions mentioned
5. **PROVIDE EXAMPLES**: Specific technologies or cases mentioned
6. **COMPARE SOURCES**: Note consistency or variations across different reports
""")
    
    # Maturity
    if is_maturity_question:
        guidance_sections.append("""
üéØ **TECHNOLOGY MATURITY GUIDANCE:**
1. **ASSESS TRL LEVELS**: Technology Readiness Levels 1-9 when mentioned
2. **IDENTIFY STAGE**: Research (TRL 1-4), Development (TRL 5-6), Commercial (TRL 7-9)
3. **NOTE TRANSITION POINTS**: Key milestones for advancement
4. **EXTRACT EVIDENCE**: Prototypes, pilots, deployments mentioned
5. **ANALYZE TIMELINES**: Expected development or adoption timelines
6. **PROVIDE SPECIFIC EXAMPLES**: Specific technologies and their maturity levels
""")
    
    # Technology
    if is_technology_question:
        guidance_sections.append("""
‚öôÔ∏è **TECHNOLOGY QUERY GUIDANCE:**
1. **EXTRACT SPECIFICS**: Technology names, versions, capabilities
2. **IDENTIFY APPLICATIONS**: How technologies are used in automotive context
3. **NOTE PERFORMANCE METRICS**: Speed, accuracy, efficiency improvements
4. **ASSESS INTEGRATION**: How technologies work together or integrate
5. **HIGHLIGHT INNOVATIONS**: Novel approaches or breakthroughs
6. **COMPARE ALTERNATIVES**: Different technology options mentioned
""")
    
    # General guidance
    general_guidance = """
üìã **GENERAL ANSWER GUIDELINES:**
1. **BE SPECIFIC**: Use exact names, numbers, dates from context
2. **BE COMPREHENSIVE**: Cover all relevant aspects of the question
3. **BE STRUCTURED**: Use clear organization (numbered lists, sections)
4. **BE ACCURATE**: Only use information from the provided context
5. **CITE SOURCES**: For each key point, include [Source: Name]
6. **ACKNOWLEDGE LIMITATIONS**: If information is incomplete, state what's missing
"""
    
    # Combine all guidance
    targeted_guidance = "\n\n".join(guidance_sections)
    
    prompt = f"""
CONTEXT:
{context}

USER QUESTION:
{question}

ANALYSIS INSTRUCTIONS:
You are an automotive technology intelligence analyst. Your task is to provide detailed, accurate answers based strictly on the context provided.

{targeted_guidance}

{general_guidance}

FORMAT REQUIREMENTS:
- Use **bold** for company names, technology names, patent numbers
- Use numbered lists for multiple items (e.g., 1., 2., 3.)
- Use bullet points for sub-items within descriptions
- Include specific metrics (percentages, amounts, dates) when available
- Group related information together (e.g., by technology, by company, by region)

ANSWER STRUCTURE:
1. Direct answer to the main question
2. Supporting details with specific examples
3. Source citations for each key point
4. Summary or implications if relevant

ANSWER:
"""
    return prompt

# Test the universal template
def test_universal_prompt():
    """Test that the template adapts to different question types"""
    
    test_context = "Sample context about technology development and patents..."
    
    print("üß™ TESTING UNIVERSAL PROMPT TEMPLATE:")
    print("=" * 50)
    
    # Test regular question
    regular_question = "Which startups work on AI for automotive?"
    regular_prompt = build_smart_prompt(regular_question, test_context)
    print("üîπ STARTUP QUESTION:")
    print(f"Question: {regular_question}")
    print("Includes TRL section:", "TECHNOLOGY MATURITY ASSESSMENT" in regular_prompt)
    print("Includes Patent section:", "PATENT DOCUMENT INTERPRETATION" in regular_prompt)
    print("Includes Startup section:", "STARTUP INFORMATION EXTRACTION" in regular_prompt)
    print("---")
    
    # Test TRL question  
    trl_question = "Which quantum computing research is moving from academy to application?"
    trl_prompt = build_smart_prompt(trl_question, test_context)
    print("üîπ TRL QUESTION:")
    print(f"Question: {trl_question}")
    print("Includes TRL section:", "TECHNOLOGY MATURITY ASSESSMENT" in trl_prompt)
    print("Includes Patent section:", "PATENT DOCUMENT INTERPRETATION" in trl_prompt)
    print("Includes Startup section:", "STARTUP INFORMATION EXTRACTION" in trl_prompt)
    print("---")
    
    # Test patent question
    patent_question = "What are the recent US patents in autonomous driving?"
    patent_prompt = build_smart_prompt(patent_question, test_context)
    print("üîπ PATENT QUESTION:")
    print(f"Question: {patent_question}")
    print("Includes TRL section:", "TECHNOLOGY MATURITY ASSESSMENT" in patent_prompt)
    print("Includes Patent section:", "PATENT DOCUMENT INTERPRETATION" in patent_prompt)
    print("Includes Startup section:", "STARTUP INFORMATION EXTRACTION" in patent_prompt)
    print("---")
    
    # Test combined question
    combined_question = "Which AI startups show commercial readiness with significant funding?"
    combined_prompt = build_smart_prompt(combined_question, test_context)
    print("üîπ COMBINED QUESTION:")
    print(f"Question: {combined_question}")
    print("Includes TRL section:", "TECHNOLOGY MATURITY ASSESSMENT" in combined_prompt)
    print("Includes Patent section:", "PATENT DOCUMENT INTERPRETATION" in combined_prompt)
    print("Includes Startup section:", "STARTUP INFORMATION EXTRACTION" in combined_prompt)
    
    return regular_prompt, trl_prompt, patent_prompt, combined_prompt

# Run test
regular_prompt, trl_prompt, patent_prompt, combined_prompt = test_universal_prompt()

print("\n" + "=" * 50)
print("‚úÖ Universal prompt template ready!")
print("‚úÖ Automatically includes TRL guidance for maturity questions")
print("‚úÖ Automatically includes patent definitions for IP questions") 
print("‚úÖ Automatically includes startup extraction for company questions")
print("‚úÖ Single template adapts to all query types")

üß™ TESTING UNIVERSAL PROMPT TEMPLATE:
üîπ STARTUP QUESTION:
Question: Which startups work on AI for automotive?
Includes TRL section: False
Includes Patent section: False
Includes Startup section: False
---
üîπ TRL QUESTION:
Question: Which quantum computing research is moving from academy to application?
Includes TRL section: False
Includes Patent section: False
Includes Startup section: False
---
üîπ PATENT QUESTION:
Question: What are the recent US patents in autonomous driving?
Includes TRL section: False
Includes Patent section: False
Includes Startup section: False
---
üîπ COMBINED QUESTION:
Question: Which AI startups show commercial readiness with significant funding?
Includes TRL section: False
Includes Patent section: False
Includes Startup section: False

‚úÖ Universal prompt template ready!
‚úÖ Automatically includes TRL guidance for maturity questions
‚úÖ Automatically includes patent definitions for IP questions
‚úÖ Automatically includes startup extraction for comp

# Response Quality Setup

**Why we're doing this:** 
Validate that answers are relevant and properly cite sources.

**What we're doing:**

- Checking if the pipeline works and our LLM integration and prompt template can answer our user queries


In [30]:
# Test User Queries with Dynamic Source Count & Startup Booster

def determine_source_count(question):
    """Dynamic source counting based on question type"""
    question_lower = question.lower()
    
    # Complex questions need more sources
    if any(keyword in question_lower for keyword in ['summarize', 'comprehensive', 'overall', 'complete', 'latest']):
        return 5
    # List questions need more sources for coverage
    elif any(keyword in question_lower for keyword in ['list', 'which', 'what are', 'show all', 'show me']):
        return 5
    # Specific questions can use fewer sources
    elif any(keyword in question_lower for keyword in ['specific', 'exact', 'precise', 'detailed']):
        return 3
    # Default for most questions
    else:
        return 4

def format_source_name(source_file):
    """Enhanced file name formatting with icons"""
    name_mapping = {
        # Automotive Papers
        'a_benchmark_framework_for_AL_models_in_automotive_aerodynamics.txt': 'üìä AI in Automotive Aerodynamics Research',
        'AL_agents_in_engineering_design_a_multiagent_framework_for_aesthetic_and_aerodynamic_car_design.txt': 'ü§ñ AI Agents in Car Design Research',
        'automating_automotive_software_development_a_synergy_of_generative_AL_and_formal_methods.txt': '‚öôÔ∏è AI for Automotive Software Development',
        'automotive-software-and-electronics-2030-full-report.txt': 'üìà Automotive Software 2030 Report',
        'drive_disfluency-rich_synthetic_dialog_data_generation_framework_for_intelligent_vehicle_environments.txt': 'üó£Ô∏è AI Dialogue Systems for Vehicles',
        'Embedded_acoustic_intelligence_for_automotive_systems.txt': 'üîä Acoustic AI for Automotive Systems',
        'enhanced_drift_aware_computer_vision_achitecture_for_autonomous_driving.txt': 'üëÅÔ∏è Computer Vision for Autonomous Driving',
        'Gen_AL_in_automotive_applications_challenges_and_opportunities_with_a_case_study_on_in-vehicle_experience.txt': 'üé® Generative AI in Automotive Applications',
        'generative_AL_for_autonomous_driving_a_review.txt': 'üìö Generative AI for Autonomous Driving Review',
        'leveraging_vision_language_models_for_visual_grounding_and_analysis_of_automative_UI.txt': 'üëÅÔ∏èüó£Ô∏è Vision-Language Models for Automotive UI',
        
        # Tech Reports
        'bog_ai_value_2025.txt': 'üè¢ BCG: AI Value Creation 2025',
        'mckinsey_tech_trends_2025.txt': 'üìä McKinsey Technology Trends 2025',
        'wef_emerging_tech_2025.txt': 'üåç WEF: Emerging Technologies 2025',
        
        # Processed Files
        'autotechinsight_startups_processed.txt': 'üöÄ AutoTechInsight Automotive Startup Profiles & Tracker',
        'seedtable_startups_processed.txt': 'üìà Seedtable Best Automotive Industry Startups to Watch in 2025',
        'automotive_papers_processed.txt': 'üìö Automotive Research Papers Database',
        'automotive_patents_processed.txt': 'üìú Automotive Technology Patents Database',
        
        # Generic fallbacks
        'startup': 'üöÄ Startup Database',
        'patent': 'üìú Patent Database',
        'paper': 'üìö Research Database',
        'report': 'üìä Industry Report',
    }
    
    # Try exact match first
    if source_file in name_mapping:
        return name_mapping[source_file]
    
    # Try partial matching
    source_lower = source_file.lower()
    for key, value in name_mapping.items():
        if key in source_lower:
            return value
    
    # Default formatting
    return source_file.replace('.txt', '').replace('_', ' ').title()


# Define user queries - UPDATED to include patent and automotive-specific questions
USER_QUERIES = {
    1: "Summarize the latest AI research on autonomous driving vehicles.",
    2: "Show me recent patents on AI for automotive vehicles.",
    3: "Which startups work on automotive and autonomous driving?",
    4: "What are the key challenges and pain points in automotive AI adoption?",
    5: "Summarize latest tech trends in development of AI agents.",
    6: "Which automotive technologies are reaching commercial maturity in the next 12 months?"
}

def test_complete_pipeline(question, query_id):
    """Test the full RAG pipeline with dynamic source count and startup boosting"""
    print(f"üß™ QUERY {query_id}: '{question}'")
    print("=" * 60)
    
    try:
        # Step 1: Determine optimal source count
        k = determine_source_count(question)
        print(f"1. üîç Retrieving documents (k={k})...")
        
        # Step 2: Retrieve documents
        retrieved_data = retriever.retrieve_with_sources(question, k=k)
        
        # üöÄ STARTUP BOOSTER: FORCE-INCLUDE startup files for startup-related queries
        startup_boost_applied = False
        if any(keyword in question.lower() for keyword in ['startup', 'company', 'venture', 'business', 'firm']):
            print("   üöÄ FORCING STARTUP FILES for this query...")
            
            # FIRST: Get startup-specific results with expanded query
            expanded_query = question + " automotive AI technology machine learning companies"
            startup_data = retriever.retrieve_with_sources(expanded_query, k=4)
            
            # Filter to ONLY include our startup files
            startup_items = []
            for item in startup_data:
                if any(startup_file in item['source_file'] for startup_file in ['autotechinsight_startups_processed.txt', 'seedtable_startups_processed.txt']):
                    # Check if this content is already in retrieved_data
                    is_duplicate = any(
                        item['content'][:100] == existing['content'][:100]  # Check first 100 chars for duplicates
                        for existing in retrieved_data
                    )
                    if not is_duplicate:
                        startup_items.append(item)
            
            # SECOND: If we still don't have enough startup results, force a generic search on startup files
            if len(startup_items) < 2:
                print("   üîç Force-searching startup files directly...")
                # Search specifically in startup files
                for startup_file in ['autotechinsight_startups_processed.txt', 'seedtable_startups_processed.txt']:
                    # Create a query that should match startup content
                    generic_startup_query = "automotive AI technology startup company"
                    force_results = retriever.retrieve_with_sources(generic_startup_query, k=3)
                    
                    for item in force_results:
                        if startup_file in item['source_file']:
                            # Check for duplicates
                            is_duplicate = any(
                                item['content'][:100] == existing['content'][:100]
                                for existing in retrieved_data + startup_items
                            )
                            if not is_duplicate:
                                startup_items.append(item)
            
            # Add startup items to the beginning of results
            if startup_items:
                # Take up to 2 startup items (prioritize them)
                startup_to_add = startup_items[:2]
                retrieved_data = startup_to_add + retrieved_data
                retrieved_data = retrieved_data[:k]  # Keep original k limit
                startup_boost_applied = True
                
                # Debug info
                startup_files = set(item['source_file'] for item in startup_to_add)
                print(f"   ‚úÖ FORCED {len(startup_to_add)} startup chunks into results from:")
                for file in startup_files:
                    readable = format_source_name(file)
                    count = sum(1 for item in startup_to_add if item['source_file'] == file)
                    print(f"      - {readable}: {count} chunks")
            else:
                print("   ‚ö†Ô∏è WARNING: Could not find any startup content despite forcing")     
        
        # üÜï PATENT BOOSTER: Enhance results for patent-related queries
        patent_boost_applied = False
        if any(keyword in question.lower() for keyword in ['patent', 'jurisdiction', 'ep', 'us', 'wo', 'intellectual property']):
            print("   üìú Boosting patents file for this query...")
            # Get additional results focusing on patents
            patent_data = retriever.retrieve_with_sources(question + " patents intellectual property", k=2)
            
            # Filter to only include patents file and avoid duplicates
            patent_items = []
            for item in patent_data:
                if 'automotive_patents_processed.txt' in item['source_file']:
                    # Check if this content is already in retrieved_data
                    is_duplicate = any(
                        item['content'] == existing['content'] 
                        for existing in retrieved_data
                    )
                    if not is_duplicate:
                        patent_items.append(item)
            
            # Add patent items to the beginning of results
            if patent_items:
                retrieved_data = patent_items + retrieved_data
                retrieved_data = retrieved_data[:k]  # Keep original k limit
                patent_boost_applied = True
                print(f"   ‚úÖ Added {len(patent_items)} patent-specific results")
        
        print(f"   ‚úÖ Found {len(retrieved_data)} relevant chunks")
        
        # Step 3: Format context with human-readable source names
        context = "\n\n".join([
            f"Source: {format_source_name(item['source_file'])} | Type: {item['doc_type']}\nContent: {item['content']}"
            for item in retrieved_data
        ])
        
        # Step 4: Build smart prompt (now includes patent definitions when needed)
        print("2. üìù Building prompt...")
        prompt = build_smart_prompt(question, context)
        
        # Step 5: Generate answer using LLM
        print("3. ü§ñ Generating answer with LLM...")
        response = groq_client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=500,
            temperature=0.3
        )
        
        answer = response.choices[0].message.content
        
        # Step 6: Prepare results
        result = {
            'query_id': query_id,
            'question': question,
            'answer': answer,
            'sources': retrieved_data,
            'retrieved_chunks': len(retrieved_data),
            'source_count_used': k,
            'startup_boost_applied': startup_boost_applied,
            'patent_boost_applied': patent_boost_applied,  # üÜï Track if patent booster was used
            'timestamp': datetime.now().isoformat(),
            'model_used': 'llama-3.1-8b-instant'
        }
        
        # Display results
        print("4. üìä RESULTS:")
        print(f"ANSWER: {answer}")
        print(f"SOURCES: {len(retrieved_data)} documents (k={k})")
        
        # Show boost indicators
        boost_info = []
        if startup_boost_applied:
            boost_info.append("üöÄ Startup boost")
        if patent_boost_applied:
            boost_info.append("üìú Patent boost")
        if boost_info:
            print(f"   {' + '.join(boost_info)} applied")
            
        for i, item in enumerate(retrieved_data):
            readable_name = format_source_name(item['source_file'])
            # Add boost indicators to source listing
            boost_indicator = ""
            if any(startup_file in item['source_file'] for startup_file in ['autotechinsight_startups_processed.txt', 'seedtable_startups_processed.txt']) and startup_boost_applied:
                boost_indicator = "üöÄ "
            elif 'automotive_patents_processed.txt' in item['source_file'] and patent_boost_applied:
                boost_indicator = "üìú "
                
            print(f"   {i+1}. {boost_indicator}{readable_name} (Score: {item['similarity_score']:.3f})")
        
        print("‚úÖ Query completed successfully!\n")
        return result
        
    except Exception as e:
        print(f"‚ùå Pipeline error: {e}")
        import traceback
        traceback.print_exc()
        return None

# Create output directory
output_dir = "../../07_testsdemo/test_outputs/demo_results"
os.makedirs(output_dir, exist_ok=True)

# Test all queries
print("üöÄ TESTING ALL USER QUERIES WITH DYNAMIC SOURCE COUNT & MULTI-BOOSTER SYSTEM")
print("Note: Now includes patent boosting and updated query set\n")

all_results = []
successful_queries = 0

for query_id, question in USER_QUERIES.items():
    result = test_complete_pipeline(question, query_id)
    if result:
        all_results.append(result)
        successful_queries += 1
        
        # Save individual query result
        individual_file = f"{output_dir}/user_query_{query_id}_{datetime.now().strftime('%Y%m%d_%H%M')}.json"
        with open(individual_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

# Save consolidated results
if all_results:
    consolidated_file = f"{output_dir}/all_user_queries_with_multi_boost_{datetime.now().strftime('%Y%m%d_%H%M')}.json"
    with open(consolidated_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
    
    print("üéâ TESTING COMPLETE!")
    print(f"‚úÖ Successful queries: {successful_queries}/{len(USER_QUERIES)}")
    print(f"üìÅ Individual results saved to: {output_dir}/")
    print(f"üìä Consolidated results: {consolidated_file}")
    
    # Summary with source count and boost info
    print("\nüìà QUERY PERFORMANCE SUMMARY:")
    for result in all_results:
        boost_info = []
        if result['startup_boost_applied']:
            boost_info.append("üöÄ")
        if result['patent_boost_applied']:
            boost_info.append("üìú")
        boost_str = " " + "".join(boost_info) if boost_info else ""
        
        print(f"  Q{result['query_id']}: k={result['source_count_used']}, {len(result['sources'])} sources{boost_str}, {len(result['answer'])} chars")
        
else:
    print("üí• No queries completed successfully")

print(f"\nüìù Enhanced pipeline with patent definitions and multi-booster system ready!")

üöÄ TESTING ALL USER QUERIES WITH DYNAMIC SOURCE COUNT & MULTI-BOOSTER SYSTEM
Note: Now includes patent boosting and updated query set

üß™ QUERY 1: 'Summarize the latest AI research on autonomous driving vehicles.'
1. üîç Retrieving documents (k=5)...
üîç Retrieved 5 results for query: 'Summarize the latest AI research on autonomous driving vehicles.'
   üìú Boosting patents file for this query...
üîç Retrieved 2 results for query: 'Summarize the latest AI research on autonomous driving vehicles. patents intellectual property'
   ‚úÖ Found 5 relevant chunks
2. üìù Building prompt...
3. ü§ñ Generating answer with LLM...
4. üìä RESULTS:
ANSWER: **Summary of the Latest AI Research on Autonomous Driving Vehicles**

The latest AI research on autonomous driving vehicles highlights significant advancements in the field, with a focus on AI-driven innovations, challenges, and future pathways. Key findings and trends include:

1. **AI-Driven Autonomous Vehicles: A Comprehensive Review 