**Notebook 03 is a "run-once" setup**

- üìù NOTEBOOK 3 - SETUP ONLY
- ‚úÖ LLM client configured
- ‚úÖ Prompt templates defined  
- ‚úÖ Answer generator ready

No files saved - this notebook only needs to run once per session

# LLM Response Generation

**Why we're doing this:**
 Take retrieved document chunks and generate coherent answers using a language model.

**What we're doing:**

- Setting up first prototype - done
- Setting up the LLM client (Groq/Llama) - done
- Creating prompt templates for TRL questions - done
- Generating answers from retrieved context - done 

In [18]:
# PERMANENT WORKING IMPORT - USE THIS EVERYWHERE
import sys
import os
import importlib.util

def import_rag_components():
    """Import RAG components"""
    current_dir = os.getcwd()
    
    # Import retriever
    retriever_path = os.path.join(current_dir, 'rag_components', 'retriever.py')
    spec = importlib.util.spec_from_file_location("retriever", retriever_path)
    retriever_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(retriever_module)
    
    # Import query_interface  
    query_interface_path = os.path.join(current_dir, 'rag_components', 'query_interface.py')
    spec = importlib.util.spec_from_file_location("query_interface", query_interface_path)
    query_interface_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(query_interface_module)
    
    # Import answer_generator
    answer_generator_path = os.path.join(current_dir, 'rag_components', 'answer_generator.py')
    spec = importlib.util.spec_from_file_location("answer_generator", answer_generator_path)
    answer_generator_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(answer_generator_module)
    
    return (retriever_module.DocumentAwareRetriever, 
            query_interface_module.SimpleQueryInterface,
            answer_generator_module.RAGAnswerGenerator)

# Import the components
DocumentAwareRetriever, SimpleQueryInterface, RAGAnswerGenerator = import_rag_components()
print("üéâ COMPONENTS IMPORTED SUCCESSFULLY!")

# Continue with code
VECTOR_INDEX_PATH = "../../04_models/vector_index"
retriever = DocumentAwareRetriever(VECTOR_INDEX_PATH)
query_interface = SimpleQueryInterface(retriever)
answer_generator = RAGAnswerGenerator(query_interface)
print("‚úÖ Generation pipeline ready!")

FileNotFoundError: [Errno 2] No such file or directory

In [None]:
pip install groq

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
# CELL: LLM Client Setup
import os
from groq import Groq
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Initialize Groq client
def setup_groq_client():
    """Set up and return Groq client with error handling"""
    api_key = os.getenv('GROQ_API_KEY')
    
    if not api_key:
        raise ValueError("‚ùå GROQ_API_KEY not found in environment variables")
    
    client = Groq(api_key=api_key)
    print("‚úÖ Groq client initialized successfully")
    return client

# Test the client
try:
    groq_client = setup_groq_client()
    print("üéâ LLM client ready for integration!")
except Exception as e:
    print(f"‚ùå Failed to initialize LLM client: {e}")

‚úÖ Groq client initialized successfully
üéâ LLM client ready for integration!


In [None]:
# CELL: Test LLM Connection
# Why: Verify Groq API works and model responds correctly
# What: Send simple test query to confirm setup is functional
def test_llm_connection():
    try:
        response = groq_client.chat.completions.create(
            model="llama-3.1-8b-instant",  # Fast, free model for testing
            messages=[{"role": "user", "content": "Reply only with 'API connected'"}],
            max_tokens=10,
            temperature=0.1
        )
        print(f"‚úÖ LLM Connected: {response.choices[0].message.content}")
        return True
    except Exception as e:
        print(f"‚ùå LLM Failed: {e}")
        return False

test_llm_connection()

‚úÖ LLM Connected: API connected


True

In [None]:
# CELL: Integrate with Your Generator
def generate_with_llm(query, context):
    """Generate answer using Groq/Llama"""
    prompt = f"""
    Based on the following context, answer the user's question.
    
    Context: {context}
    
    Question: {query}
    
    Answer:
    """
    
    response = groq_client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=500,
        temperature=0.3
    )
    
    return response.choices[0].message.content

print("üöÄ LLM integration code ready!")

üöÄ LLM integration code ready!


In [None]:
# CELL: Universal Prompt Template with Patent Definitions
# Why: Single template that adapts to TRL, patent, and regular queries automatically
# What: Smart template that detects when to include maturity analysis AND patent definitions

UNIVERSAL_PROMPT_TEMPLATE = """
CONTEXT:
{context}

USER QUESTION:
{question}

ANALYSIS INSTRUCTIONS:
1. Provide a comprehensive answer based strictly on the context provided
2. Cite specific sources for each key point using [Source: filename]
3. If the context is insufficient, acknowledge what cannot be answered

{trl_section}
{patent_section}
{startup_section}

ADDITIONAL GUIDELINES:
- For technology maturity questions: assess development stage and transition evidence
- For patent questions: consider jurisdiction and document type implications
- For trend questions: identify velocity, drivers, and key players  
- For forecasting: distinguish near-term vs long-term developments
- For descriptive questions: provide specific examples and entities

ANSWER:
"""

def build_smart_prompt(question, context):
    """Build adaptive prompt that includes TRL and patent guidance only when needed"""
    
    # Detect if this is a technology maturity question
    maturity_keywords = ['trl', 'mature', 'transition', 'academy to application', 
                        'commercial', 'moving from academy', 'readiness', 'development stage']
    
    # Detect if this is a patent-related question
    patent_keywords = ['patent', 'intellectual property', 'ip', 'jurisdiction', 'ep', 'us', 'wo',
                      'kind', 'a1', 'b2', 'filing', 'protection', 'patent office', 'lens']
    
    # Detect if this is a startup-related question
    startup_keywords = ['startup', 'startups', 'company', 'companies', 'venture', 'business', 
                       'funding', 'investment', 'series a', 'series b', 'series c', 'backed']
    
    question_lower = question.lower()
    is_maturity_question = any(keyword in question_lower for keyword in maturity_keywords)
    is_patent_question = any(keyword in question_lower for keyword in patent_keywords)
    is_startup_question = any(keyword in question_lower for keyword in startup_keywords)
    
    # Include TRL section only for maturity questions
    if is_maturity_question:
        trl_section = """
TECHNOLOGY MATURITY ASSESSMENT:
- When discussing technology readiness, reference these stages:
  * Research Phase (TRL 1-4): Basic research, lab validation
  * Development Phase (TRL 5-6): Prototyping, testing  
  * Commercialization Phase (TRL 7-9): Deployment, scaling
- Assess current stage based on evidence in context
- Identify transition indicators and timelines
"""
    else:
        trl_section = ""
    
    # Include patent definitions only for patent questions
    if is_patent_question:
        patent_section = """
PATENT DOCUMENT INTERPRETATION:
- JURISDICTION indicates geographic protection scope:
  * EP: European Patent Office (multiple European countries)
  * US: United States Patent and Trademark Office
  * WO: World Intellectual Property Organization (PCT international applications)
  
- KIND CODES indicate document type and status:
  * A1: Patent application with search report
  * A2: Patent application without search report  
  * A3: Search report published separately
  * B1: Granted patent (examined and approved)
  * B2: Amended/revised granted patent
  
- Consider jurisdiction for market focus and protection scope
- Use kind codes to distinguish between applications (A) and granted patents (B)
"""
    else:
        patent_section = ""
    
    # Include startup guidance only for startup questions
    if is_startup_question:
        startup_section = """
STARTUP INFORMATION EXTRACTION:
- When startup data is available in context, extract and list specific startup names
- For each startup mentioned, include:
  * Primary focus area or technology specialization
  * Location and key business details
  * Funding status if available (funding rounds, money raised)
  * Notable products or services
- Prioritize information from startup profiles and company databases
- Group startups by technology focus areas when possible
- If startup information is present but not directly answering the question, still mention relevant startups
"""
    else:
        startup_section = ""
    
    prompt = UNIVERSAL_PROMPT_TEMPLATE.format(
        context=context,
        question=question,
        trl_section=trl_section,
        patent_section=patent_section,
        startup_section=startup_section
    )
    
    return prompt

# Test the universal template
def test_universal_prompt():
    """Test that the template adapts to different question types"""
    
    test_context = "Sample context about technology development and patents..."
    
    print("üß™ TESTING UNIVERSAL PROMPT TEMPLATE:")
    print("=" * 50)
    
    # Test regular question
    regular_question = "Which startups work on AI for automotive?"
    regular_prompt = build_smart_prompt(regular_question, test_context)
    print("üîπ STARTUP QUESTION:")
    print(f"Question: {regular_question}")
    print("Includes TRL section:", "TECHNOLOGY MATURITY ASSESSMENT" in regular_prompt)
    print("Includes Patent section:", "PATENT DOCUMENT INTERPRETATION" in regular_prompt)
    print("Includes Startup section:", "STARTUP INFORMATION EXTRACTION" in regular_prompt)
    print("---")
    
    # Test TRL question  
    trl_question = "Which quantum computing research is moving from academy to application?"
    trl_prompt = build_smart_prompt(trl_question, test_context)
    print("üîπ TRL QUESTION:")
    print(f"Question: {trl_question}")
    print("Includes TRL section:", "TECHNOLOGY MATURITY ASSESSMENT" in trl_prompt)
    print("Includes Patent section:", "PATENT DOCUMENT INTERPRETATION" in trl_prompt)
    print("Includes Startup section:", "STARTUP INFORMATION EXTRACTION" in trl_prompt)
    print("---")
    
    # Test patent question
    patent_question = "What are the recent US patents in autonomous driving?"
    patent_prompt = build_smart_prompt(patent_question, test_context)
    print("üîπ PATENT QUESTION:")
    print(f"Question: {patent_question}")
    print("Includes TRL section:", "TECHNOLOGY MATURITY ASSESSMENT" in patent_prompt)
    print("Includes Patent section:", "PATENT DOCUMENT INTERPRETATION" in patent_prompt)
    print("Includes Startup section:", "STARTUP INFORMATION EXTRACTION" in patent_prompt)
    print("---")
    
    # Test combined question
    combined_question = "Which AI startups show commercial readiness with significant funding?"
    combined_prompt = build_smart_prompt(combined_question, test_context)
    print("üîπ COMBINED QUESTION:")
    print(f"Question: {combined_question}")
    print("Includes TRL section:", "TECHNOLOGY MATURITY ASSESSMENT" in combined_prompt)
    print("Includes Patent section:", "PATENT DOCUMENT INTERPRETATION" in combined_prompt)
    print("Includes Startup section:", "STARTUP INFORMATION EXTRACTION" in combined_prompt)
    
    return regular_prompt, trl_prompt, patent_prompt, combined_prompt

# Run test
regular_prompt, trl_prompt, patent_prompt, combined_prompt = test_universal_prompt()

print("\n" + "=" * 50)
print("‚úÖ Universal prompt template ready!")
print("‚úÖ Automatically includes TRL guidance for maturity questions")
print("‚úÖ Automatically includes patent definitions for IP questions") 
print("‚úÖ Automatically includes startup extraction for company questions")
print("‚úÖ Single template adapts to all query types")

üß™ TESTING UNIVERSAL PROMPT TEMPLATE:
üîπ STARTUP QUESTION:
Question: Which startups work on AI for automotive?
Includes TRL section: False
Includes Patent section: True
Includes Startup section: True
---
üîπ TRL QUESTION:
Question: Which quantum computing research is moving from academy to application?
Includes TRL section: True
Includes Patent section: False
Includes Startup section: False
---
üîπ PATENT QUESTION:
Question: What are the recent US patents in autonomous driving?
Includes TRL section: False
Includes Patent section: True
Includes Startup section: False
---
üîπ COMBINED QUESTION:
Question: Which AI startups show commercial readiness with significant funding?
Includes TRL section: True
Includes Patent section: False
Includes Startup section: True

‚úÖ Universal prompt template ready!
‚úÖ Automatically includes TRL guidance for maturity questions
‚úÖ Automatically includes patent definitions for IP questions
‚úÖ Automatically includes startup extraction for company qu

# Response Quality Setup

**Why we're doing this:** 
Ensure answers are relevant and properly cite sources.

**What we're doing:**

- Checking if the pipeline works and our LLM integration and prompt template can return something nice. 


In [None]:
# CELL: Test All User Queries with Dynamic Source Count & Startup Booster
# Why: Validate pipeline performance with intelligent source retrieval and startup boosting
# What: Run all 8 user queries with dynamic k-value and startup file enhancement

import json
import os
from datetime import datetime

def determine_source_count(question):
    """Dynamically determine how many sources to retrieve based on question type"""
    question_lower = question.lower()
    
    if any(keyword in question_lower for keyword in ['summarize', 'trends', 'overview', 'comprehensive']):
        return 5  # More sources for comprehensive questions
    elif any(keyword in question_lower for keyword in ['which', 'list', 'show me']):
        return 4  # Medium for listing questions
    elif any(keyword in question_lower for keyword in ['specific', 'exact', 'precise']):
        return 2  # Fewer for very specific questions
    else:
        return 3  # Default

def format_source_name(source_file):
    """Convert file names to human-readable format for better UX"""
    name_mapping = {
        # Automotive Papers
        'a_benchmark_framework_for_AL_models_in_automotive_aerodynamics.txt': 'Benchmark Framework for AI Models in Automotive Aerodynamics',
        'AL_agents_in_engineering_design_a_multiagent_framework_for_aesthetic_and_aerodynamic_car_design.txt': 'AI Agents in Engineering Design: Multiagent Framework for Car Design',
        'automating_automotive_software_development_a_synergy_of_generative_AL_and_formal_methods.txt': 'Automating Automotive Software Development: Generative AI and Formal Methods',
        'automotive-software-and-electronics-2030-full-report.txt': 'Automotive Software and Electronics 2030 Report',
        'drive_disfluency-rich_synthetic_dialog_data_generation_framework_for_intelligent_vehicle_environments.txt': 'DRIVE Framework: Synthetic Dialog Data for Intelligent Vehicles',
        'Embedded_acoustic_intelligence_for_automotive_systems.txt': 'Embedded Acoustic Intelligence for Automotive Systems',
        'enhanced_drift_aware_computer_vision_achitecture_for_autonomous_driving.txt': 'Enhanced Drift-Aware Computer Vision for Autonomous Driving',
        'Gen_AL_in_automotive_applications_challenges_and_opportunities_with_a_case_study_on_in-vehicle_experience.txt': 'Generative AI in Automotive: Applications and Challenges',
        'generative_AL_for_autonomous_driving_a_review.txt': 'Generative AI for Autonomous Driving: A Review',
        'leveraging_vision_language_models_for_visual_grounding_and_analysis_of_automative_UI.txt': 'Vision-Language Models for Automotive UI Analysis',
        
        # Tech Reports
        'bog_ai_value_2025.txt': 'Boston Consulting Group: AI Value Creation 2025',
        'mckinsey_tech_trends_2025.txt': 'McKinsey Technology Trends Outlook 2025',
        'wef_emerging_tech_2025.txt': 'World Economic Forum: Emerging Technologies 2025',
        
        # New Processed Files
        'autotechinsight_startups_processed.txt': 'Automotive Startup Profiles & Tracker',
        'seedtable_startups_processed.txt': 'Automotive Industry Startups to Watch in 2025',
        'automotive_papers_processed.txt': 'Automotive Research Papers Database',
        'automotive_patents_processed.txt': 'Automotive Technology Patents Database',
    }
    return name_mapping.get(source_file, source_file.replace('.txt', '').replace('_', ' ').title())

# Define user queries - UPDATED to include patent and automotive-specific questions
USER_QUERIES = {
    1: "Which startups work on AI for automotive?",
    2: "Summarize the latest research on autonomous driving.",
    3: "What are the latest tech trends in development of AI agents",
    4: "Summarize the key pain points/use cases in automotive AI.",
    5: "Show me recent patents on AI for automotive.",
    6: "Which technologies are likely to mature next year?",
    7: "Which AI research topics in automotive are growing fastest?",
    8: "Which automotive technologies are moving from academy to application?"
}

def test_complete_pipeline(question, query_id):
    """Test the full RAG pipeline with dynamic source count and startup boosting"""
    print(f"üß™ QUERY {query_id}: '{question}'")
    print("=" * 60)
    
    try:
        # Step 1: Determine optimal source count
        k = determine_source_count(question)
        print(f"1. üîç Retrieving documents (k={k})...")
        
        # Step 2: Retrieve documents
        retrieved_data = retriever.retrieve_with_sources(question, k=k)
        
             # üöÄ STARTUP BOOSTER: Enhance results for startup-related queries
        startup_boost_applied = False
        if any(keyword in question.lower() for keyword in ['startup', 'company', 'venture', 'business']):
            print("   üöÄ Boosting startups file for this query...")
            # Get additional results focusing on startups
            startup_data = retriever.retrieve_with_sources(question + " startups companies", k=2)
            
            # Filter to only include our specific startup files and avoid duplicates
            startup_items = []
            for item in startup_data:
                if any(startup_file in item['source_file'] for startup_file in ['autotechinsight_startups_processed.txt', 'seedtable_startups_processed.txt']):
                    # Check if this content is already in retrieved_data
                    is_duplicate = any(
                        item['content'] == existing['content'] 
                        for existing in retrieved_data
                    )
                    if not is_duplicate:
                        startup_items.append(item)
            
            # Add startup items to the beginning of results
            if startup_items:
                retrieved_data = startup_items + retrieved_data
                retrieved_data = retrieved_data[:k]  # Keep original k limit
                startup_boost_applied = True
                print(f"   ‚úÖ Added {len(startup_items)} startup-specific results from: {', '.join(set(item['source_file'] for item in startup_items))}")
        
        # üÜï PATENT BOOSTER: Enhance results for patent-related queries
        patent_boost_applied = False
        if any(keyword in question.lower() for keyword in ['patent', 'jurisdiction', 'ep', 'us', 'wo', 'intellectual property']):
            print("   üìú Boosting patents file for this query...")
            # Get additional results focusing on patents
            patent_data = retriever.retrieve_with_sources(question + " patents intellectual property", k=2)
            
            # Filter to only include patents file and avoid duplicates
            patent_items = []
            for item in patent_data:
                if 'automotive_patents_processed.txt' in item['source_file']:
                    # Check if this content is already in retrieved_data
                    is_duplicate = any(
                        item['content'] == existing['content'] 
                        for existing in retrieved_data
                    )
                    if not is_duplicate:
                        patent_items.append(item)
            
            # Add patent items to the beginning of results
            if patent_items:
                retrieved_data = patent_items + retrieved_data
                retrieved_data = retrieved_data[:k]  # Keep original k limit
                patent_boost_applied = True
                print(f"   ‚úÖ Added {len(patent_items)} patent-specific results")
        
        print(f"   ‚úÖ Found {len(retrieved_data)} relevant chunks")
        
        # Step 3: Format context with human-readable source names
        context = "\n\n".join([
            f"Source: {format_source_name(item['source_file'])} | Type: {item['doc_type']}\nContent: {item['content']}"
            for item in retrieved_data
        ])
        
        # Step 4: Build smart prompt (now includes patent definitions when needed)
        print("2. üìù Building prompt...")
        prompt = build_smart_prompt(question, context)
        
        # Step 5: Generate answer using LLM
        print("3. ü§ñ Generating answer with LLM...")
        response = groq_client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=500,
            temperature=0.3
        )
        
        answer = response.choices[0].message.content
        
        # Step 6: Prepare results
        result = {
            'query_id': query_id,
            'question': question,
            'answer': answer,
            'sources': retrieved_data,
            'retrieved_chunks': len(retrieved_data),
            'source_count_used': k,
            'startup_boost_applied': startup_boost_applied,
            'patent_boost_applied': patent_boost_applied,  # üÜï Track if patent booster was used
            'timestamp': datetime.now().isoformat(),
            'model_used': 'llama-3.1-8b-instant'
        }
        
        # Display results
        print("4. üìä RESULTS:")
        print(f"ANSWER: {answer}")
        print(f"SOURCES: {len(retrieved_data)} documents (k={k})")
        
        # Show boost indicators
        boost_info = []
        if startup_boost_applied:
            boost_info.append("üöÄ Startup boost")
        if patent_boost_applied:
            boost_info.append("üìú Patent boost")
        if boost_info:
            print(f"   {' + '.join(boost_info)} applied")
            
        for i, item in enumerate(retrieved_data):
            readable_name = format_source_name(item['source_file'])
            # Add boost indicators to source listing
            boost_indicator = ""
            if 'startups_processed.txt' in item['source_file'] and startup_boost_applied:
                boost_indicator = "üöÄ "
            elif 'automotive_patents_processed.txt' in item['source_file'] and patent_boost_applied:
                boost_indicator = "üìú "
                
            print(f"   {i+1}. {boost_indicator}{readable_name} (Score: {item['similarity_score']:.3f})")
        
        print("‚úÖ Query completed successfully!\n")
        return result
        
    except Exception as e:
        print(f"‚ùå Pipeline error: {e}")
        import traceback
        traceback.print_exc()
        return None

# Create output directory
output_dir = "../../07_testsdemo/test_outputs/demo_results"
os.makedirs(output_dir, exist_ok=True)

# Test all queries
print("üöÄ TESTING ALL USER QUERIES WITH DYNAMIC SOURCE COUNT & MULTI-BOOSTER SYSTEM")
print("Note: Now includes patent boosting and updated query set\n")

all_results = []
successful_queries = 0

for query_id, question in USER_QUERIES.items():
    result = test_complete_pipeline(question, query_id)
    if result:
        all_results.append(result)
        successful_queries += 1
        
        # Save individual query result
        individual_file = f"{output_dir}/user_query_{query_id}_{datetime.now().strftime('%Y%m%d_%H%M')}.json"
        with open(individual_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

# Save consolidated results
if all_results:
    consolidated_file = f"{output_dir}/all_user_queries_with_multi_boost_{datetime.now().strftime('%Y%m%d_%H%M')}.json"
    with open(consolidated_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
    
    print("üéâ TESTING COMPLETE!")
    print(f"‚úÖ Successful queries: {successful_queries}/{len(USER_QUERIES)}")
    print(f"üìÅ Individual results saved to: {output_dir}/")
    print(f"üìä Consolidated results: {consolidated_file}")
    
    # Summary with source count and boost info
    print("\nüìà QUERY PERFORMANCE SUMMARY:")
    for result in all_results:
        boost_info = []
        if result['startup_boost_applied']:
            boost_info.append("üöÄ")
        if result['patent_boost_applied']:
            boost_info.append("üìú")
        boost_str = " " + "".join(boost_info) if boost_info else ""
        
        print(f"  Q{result['query_id']}: k={result['source_count_used']}, {len(result['sources'])} sources{boost_str}, {len(result['answer'])} chars")
        
else:
    print("üí• No queries completed successfully")

print(f"\nüìù Enhanced pipeline with patent definitions and multi-booster system ready!")

üöÄ TESTING ALL USER QUERIES WITH DYNAMIC SOURCE COUNT & MULTI-BOOSTER SYSTEM
Note: Now includes patent boosting and updated query set

üß™ QUERY 1: 'Which startups work on AI for automotive?'
1. üîç Retrieving documents (k=4)...
   üöÄ Boosting startups file for this query...
   üìú Boosting patents file for this query...
   ‚úÖ Found 4 relevant chunks
2. üìù Building prompt...
3. ü§ñ Generating answer with LLM...
4. üìä RESULTS:
ANSWER: Based on the provided context, I will attempt to answer the question about startups working on AI for automotive.

Unfortunately, the context does not directly mention any specific startups working on AI for automotive. However, I can provide some insights based on the research papers and reports provided.

From the research paper "Gen Ai In Automotive Applications Challenges And Opportunities With A Case Study On In-Vehicle Experience," there is no mention of specific startups. However, it does discuss the applications and challenges of gener

In [None]:
# Add this right after the context is built in your test_complete_pipeline function:

        # Step 3: Format context with human-readable source names
        context = "\n\n".join([
            f"Source: {format_source_name(item['source_file'])} | Type: {item['doc_type']}\nContent: {item['content']}"
            for item in retrieved_data
        ])
        
        # üÜï DEBUG: Check what startup content is actually in the context
        print("   üîç DEBUG: Checking startup content in context...")
        startup_sources = [item for item in retrieved_data if any(startup_file in item['source_file'] for startup_file in ['autotechinsight_startups_processed.txt', 'seedtable_startups_processed.txt'])]
        print(f"   üìä Startup documents in context: {len(startup_sources)}")
        for i, startup_doc in enumerate(startup_sources):
            content_preview = startup_doc['content'][:200] + "..." if len(startup_doc['content']) > 200 else startup_doc['content']
            print(f"      {i+1}. {startup_doc['source_file']}: {content_preview}")
        
        # Step 4: Build smart prompt (now includes patent definitions when needed)
        print("2. üìù Building prompt...")

IndentationError: unexpected indent (461264955.py, line 4)