# Specialized Processing for each Data Type

Note: An overview of the data sources is available in `/01_data/rag_automotive_tech/metadata.json`. All processed documents are saved to `01_data/rag_automotive_tech/processed`.

**Startup Dataset Processing**

In [43]:
!pip install pandas

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


In [44]:
pip install --upgrade pip

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
from pathlib import Path
import pandas as pd
import csv

In [5]:
def process_all_startups(csv_path, output_path):
    """Write every row of the startups CSV to a plain-text file, one block per startup.

    No filtering is applied. Each block lists Name, Location, Tagline and
    Description. A field falls back to 'N/A' when none of its candidate
    columns holds a non-empty value.

    Args:
        csv_path: Path of the source CSV; the first row must be the header.
        output_path: Path of the text file to create. Missing parent
            directories are created.

    Returns:
        True if the text file was written; False if the CSV does not exist or
        any error occurred (the error is printed, not raised).
    """

    def first_value(row, *columns):
        """Return the first non-blank value found under `columns`, else 'N/A'."""
        for column in columns:
            value = row.get(column)
            # DictReader yields '' for empty cells and None for cells missing
            # from short rows; neither is useful in the output.
            if value is not None and str(value).strip():
                return value
        return 'N/A'

    # Check if CSV file exists
    csv_file = Path(csv_path)
    if not csv_file.exists():
        print(f"‚ùå ERROR: CSV file not found at: {csv_file.absolute()}")
        return False

    print(f"‚úÖ Found CSV file: {csv_file}")

    try:
        # Read the entire CSV. newline='' lets the csv module handle line
        # breaks itself, including ones embedded in quoted fields.
        with open(csv_path, 'r', encoding='utf-8', newline='') as csvfile:
            csv_reader = csv.DictReader(csvfile)
            startups = list(csv_reader)
            columns = csv_reader.fieldnames

        print(f"üìä Total startups loaded: {len(startups)}")
        print(f"üìã Columns available: {columns}")

        # Create output directory
        output_file = Path(output_path)
        output_file.parent.mkdir(parents=True, exist_ok=True)

        # Save ALL startups to .txt file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("COMPLETE STARTUPS DATASET\n")
            f.write(f"Total startups: {len(startups)}\n")
            f.write(f"Source: {csv_path}\n")
            f.write("="*60 + "\n\n")

            for i, startup in enumerate(startups, 1):
                f.write(f"STARTUP #{i}:\n")
                f.write(f"  Name: {first_value(startup, 'startup-name', 'name')}\n")
                # The dataset stores the location under 'city'; 'location' is
                # still tried first for files that use that header.
                f.write(f"  Location: {first_value(startup, 'location', 'city')}\n")
                f.write(f"  Tagline: {first_value(startup, 'tagline')}\n")
                f.write(f"  Description: {first_value(startup, 'description')}\n")
                f.write("-" * 50 + "\n\n")

        print(f"‚úÖ Successfully saved ALL {len(startups)} startups to: {output_path}")
        return True

    except Exception as e:
        print(f"‚ùå Error: {e}")
        return False

# Source CSV and destination text file, relative to the notebook's folder
csv_path = "../01_data/rag_automotive_tech/raw_sources/startups/startups_worldwide.csv"
output_path = "../01_data/rag_automotive_tech/processed/startups_processed.txt"

# Convert the dataset, then report the overall outcome
success = process_all_startups(csv_path, output_path)
print("\nüéâ All startups processed successfully!" if success else "\nüí• Processing failed!")

‚úÖ Found CSV file: ../01_data/rag_automotive_tech/raw_sources/startups/startups_worldwide.csv
üìä Total startups loaded: 42038
üìã Columns available: ['name', 'city', 'tagline', 'description']
‚úÖ Successfully saved ALL 42038 startups to: ../01_data/rag_automotive_tech/processed/startups_processed.txt

üéâ All startups processed successfully!


**Tech Reports Processing**

In [6]:
from pathlib import Path
import PyPDF2
import re

def extract_pdf_text(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Each page's text is followed by a newline. A page that yields no text
    contributes just that newline instead of aborting the whole document.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts, or "" if the file could not be opened or
        parsed (the error is printed, not raised).
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Defensive `or ""`: if a page yields None instead of a string,
            # `None + "\n"` would raise and the text of all other pages
            # would be lost.
            page_texts = [(page.extract_text() or "") for page in reader.pages]
    except Exception as e:
        print(f"‚ùå Error reading PDF {pdf_path}: {e}")
        return ""
    return "".join(page_text + "\n" for page_text in page_texts)

def clean_report_text(text):
    """Normalise whitespace and strip page-number artefacts from report text.

    The substitutions run in a fixed order; the result is stripped of leading
    and trailing whitespace before it is returned.
    """
    substitutions = (
        (r'\n\s*\n', '\n\n'),        # runs of blank lines -> a single empty line
        (r'[ \t]+', ' '),            # runs of spaces/tabs -> one space
        (r'Page \d+ of \d+', ''),    # "Page 3 of 10" footers
        (r'\n\d+\n', '\n'),          # lines holding only a number
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

# Universal patterns that work across most professional reports
# Maps a section name to the regex fragments that mark a heading of that section.
# extract_key_sections applies them with re.search, so a fragment matches
# anywhere inside a line (e.g. r'sources' also matches "resources").
# Insertion order matters: sections are tried in order and the first match wins.
universal_patterns = {
    'executive_summary': [
        r'executive summary',
        r'key insights', 
        r'in brief',
        r'highlights',
        r'overview'
    ],
    'introduction': [
        r'introduction',
        r'foreword',
        r'preface',
        r'background',
        r'context'
    ],
    'methodology': [
        r'methodology',
        r'approach',
        r'research design',
        r'data sources',
        r'analytical framework'
    ],
    'conclusions': [
        # r'summary' is only reached when 'executive_summary' above did not match
        r'conclusion',
        r'summary',
        r'key takeaways',
        r'implications',
        r'closing remarks'
    ],
    'references': [
        r'references',
        r'endnotes',
        r'citations',
        r'bibliography',
        r'sources'
    ],
    'appendix': [
        r'appendix',
        r'supplemental',
        r'additional data',
        r'supporting information'
    ]
}

# WEF-Specific Patterns (Technology & Foresight Focus)
# Same structure as universal_patterns: section name -> regex fragments.
# get_section_patterns merges this dict after universal_patterns, so the
# universal sections are tried before the ones defined here.
wef_specific_patterns = {
    'technology_catalog': [
        # NOTE(review): with re.search this also matches any two digits followed by
        # whitespace and a word character, e.g. the "25 outlook" in "2025 outlook".
        r'\d{2}\s+[\w\s]+',  # Matches "01 Structural battery composites"
        r'technolog(y|ies)',
        r'emerging technologies',
        r'top \d+',
        r'structural battery composites',
        r'osmotic power systems',
        r'advanced nuclear technologies',
        r'engineered living therapeutics',
        r'GLP[-\s]?1s',
        r'autonomous biochemical sensing',
        r'green nitrogen fixation',
        r'nanozymes',
        r'collaborative sensing',
        r'generative watermarking'
    ],
    'strategic_outlook': [
        r'strategic outlook',
        r'future implications',
        r'transformative potential',
        r'societal transformation',
        r'strategic implications',
        r'forward-looking'
    ],
    'ecosystem_readiness': [
        r'ecosystem readiness',
        r'readiness map',
        r'stee?p analysis',  # "steep analysis" or "step analysis"
        r'social.*technological.*environmental.*economic.*policy',
        r'key actions to achieve scale',
        r'readiness assessment'
    ],
    'foresight_framework': [
        r'strategic foresight',
        r'future trends',
        r'megatrends',
        r'weak signals',
        r'future scenarios',
        r'from weak signals to'
    ],
    'technology_definition': [
        r'what are',
        r'definition of',
        # NOTE(review): r'overview' in universal_patterns['executive_summary'] is tried
        # earlier in the merged dict, so r'overview of' looks unreachable - confirm.
        r'overview of',
        r'technical background'
    ]
}

# McKinsey-Specific Patterns (Business & Consulting Focus)
# Same structure as universal_patterns: section name -> regex fragments.
# get_section_patterns merges this dict after universal_patterns, so the
# universal sections are tried before the ones defined here.
mckinsey_specific_patterns = {
    'business_case': [
        r'business case',
        r'value proposition',
        r'roi',  # unanchored: with re.search this also matches inside words such as "android"
        r'economic impact',
        r'financial benefits',
        r'cost.*benefit',
        r'value creation',
        r'investment case'
    ],
    'market_analysis': [
        r'market analysis',
        r'industry outlook',
        r'market size',
        r'growth projections',
        r'competitive landscape',
        r'market dynamics',
        r'industry trends'
    ],
    'case_studies': [
        r'case stud(y|ies)',
        r'client examples',
        r'implementation examples',
        r'success stories',
        r'use cases',
        r'client experience',
        r'example.*company'
    ],
    'implementation_roadmap': [
        r'implementation',
        r'roadmap',
        r'action plan',
        r'next steps',
        r'path forward',
        r'adoption strategy',
        r'deployment',
        r'rollout'
    ],
    'risk_assessment': [
        r'risks',
        r'challenges',
        r'barriers',
        r'mitigation',
        r'considerations',
        r'risk factors',
        r'obstacles',
        r'hurdles'
    ],
    'recommendations': [
        r'recommendations',
        r'key actions',
        r'strategic priorities',
        r'calls to action',
        r'proposed actions',
        r'way forward'
    ],
    'financial_analysis': [
        r'financial analysis',
        r'revenue potential',
        r'profitability',
        r'margin impact',
        r'financial model',
        r'economic model'
    ],
    'organizational_impact': [
        r'organizational impact',
        r'talent implications',
        r'workforce',
        r'capabilities',
        r'skills',
        r'operational impact'
    ]
}

# BCG-Specific Patterns (Consulting & Research Focus)
# Same structure as universal_patterns: section name -> regex fragments.
# get_section_patterns merges this dict after universal_patterns, so the
# universal sections are tried before the ones defined here.
bcg_specific_patterns = {
    'research_findings': [
        r'research findings',
        r'study results', 
        r'empirical proof',
        r'survey results',
        r'data shows',
        r'our research'
    ],
    'exhibits_data': [
        r'exhibit \d+',
        r'figure \d+',
        r'table \d+',
        r'data exhibit',
        r'research data',
        r'self-reported impact'
    ],
    'value_gap_analysis': [
        # Single-word fragments such as r'emerging' and r'scaling' match any short
        # line containing that word, not only headings.
        r'value gap',
        r'future-built',
        r'laggards',
        r'stagnating',
        r'emerging',
        r'scaling',
        r'maturity curve',
        r'virtuous cycle',
        r'vicious cycle'
    ],
    'case_studies_examples': [
        r'for example',
        r'case in point',
        r'consider the case',
        r'global leader in',
        r'major.*company',
        r'leading.*firm'
    ],
    'implementation_framework': [
        r'playbook',
        r'framework',
        r'10-20-70',
        r'operating model',
        r'strategic priorities',
        r'roadmap',
        r'maturity score'
    ],
    'ai_definitions': [
        r'ai definitions',
        # NOTE(review): r'methodology' also appears under universal_patterns['methodology'],
        # which is tried first in the merged dict, so the next two entries look
        # unreachable - confirm.
        r'methodology',
        r'survey methodology',
        r'maturity categories',
        r'value realization pathways'
    ],
    'sector_analysis': [
        r'by sector',
        r'by region', 
        r'industry analysis',
        r'maturity varies',
        r'regional differences'
    ]
}

# Main function to get appropriate patterns
def get_section_patterns(report_source, content_preview=""):
    """Pick the heading patterns that fit a report.

    The report source (for example its file name) is checked first. If it
    names no known publisher, keywords in the content preview decide. The
    first rule that matches wins.

    Args:
        report_source: Name or description of the report, e.g. the PDF file name.
        content_preview: Optional excerpt of the report text, used only when
            the source does not identify the publisher.

    Returns:
        A new dict merging universal_patterns with the publisher-specific
        patterns, or universal_patterns itself when nothing matches.
    """
    source_lower = report_source.lower()
    # File names such as "wef_emerging_technologies_2025.pdf" separate words with
    # "_" or "-". Multi-word keywords ("emerging technologies") can only match a
    # space-separated form, so both spellings are tested.
    source_spaced = source_lower.replace('_', ' ').replace('-', ' ')
    content_preview_lower = content_preview.lower()

    def source_mentions(keywords):
        return any(keyword in source_lower or keyword in source_spaced for keyword in keywords)

    def preview_mentions(keywords):
        return any(keyword in content_preview_lower for keyword in keywords)

    # Detect report type
    if source_mentions(['weforum', 'world economic forum', 'emerging technologies']):
        return {**universal_patterns, **wef_specific_patterns}
    elif source_mentions(['mckinsey', 'company', 'business review']):
        return {**universal_patterns, **mckinsey_specific_patterns}
    elif source_mentions(['bcg', 'boston consulting']):
        return {**universal_patterns, **bcg_specific_patterns}
    # Content-based fallback detection
    elif preview_mentions(['technology', 'emerging', 'innovation', 'future']):
        return {**universal_patterns, **wef_specific_patterns}
    elif preview_mentions(['business', 'roi', 'market', 'strategy', 'growth']):
        return {**universal_patterns, **mckinsey_specific_patterns}
    elif preview_mentions(['value gap', 'future-built', 'exhibit', 'research findings']):
        return {**universal_patterns, **bcg_specific_patterns}
    else:
        return universal_patterns  # fallback to universal only

def _append_section_text(sections, name, lines):
    """Append `lines`, joined by newlines, to sections[name], keeping earlier text."""
    if not lines:
        return
    chunk = '\n'.join(lines)
    existing = sections.get(name, '')
    sections[name] = f"{existing}\n{chunk}" if existing else chunk


def extract_key_sections(text, report_name):
    """Split report text into named sections based on heading-like lines.

    A line shorter than 100 characters that matches one of the section
    patterns (case-insensitively) starts a new section. Every other line is
    added to the section currently open. Text before the first heading goes
    to 'introduction'. When a section's heading occurs more than once, the
    text of every occurrence is kept, joined by newlines.

    Args:
        text: Cleaned report text.
        report_name: Name of the report, passed to get_section_patterns
            together with the first 1000 characters of `text`.

    Returns:
        Dict mapping section name to its text. It always contains the
        predefined keys below (empty string when nothing was found), plus any
        other section name whose pattern matched.
    """
    # Get appropriate patterns based on report type
    section_patterns = get_section_patterns(report_name, text[:1000])  # Use first 1000 chars for detection

    sections = {
        'executive_summary': '',
        'key_findings': '',
        'main_conclusions': '',
        'methodology': '',
        'introduction': '',
        'technology_catalog': '',
        'strategic_outlook': '',
        'business_case': '',
        'recommendations': ''
    }

    # Compile once per call. IGNORECASE replaces lowercasing the line, so
    # patterns that contain uppercase literals can match as well.
    compiled_patterns = [
        (section_name, [re.compile(pattern, re.IGNORECASE) for pattern in patterns])
        for section_name, patterns in section_patterns.items()
    ]

    current_section = 'introduction'
    section_content = []

    for line in text.split('\n'):
        candidate = line.strip()

        # Check if this line starts a new section
        matched_section = None
        if len(line) < 100:  # Avoid matching body text
            for section_name, regexes in compiled_patterns:
                if any(regex.search(candidate) for regex in regexes):
                    matched_section = section_name
                    break

        if matched_section is None:
            section_content.append(line)
            continue

        # Store what was collected so far without discarding earlier
        # occurrences of the same section.
        _append_section_text(sections, current_section, section_content)
        current_section = matched_section
        section_content = [line]  # Include the section header

    # Add the last section
    _append_section_text(sections, current_section, section_content)

    return sections

def process_tech_report(pdf_path, output_path):
    """Turn one report PDF into a structured text file.

    The PDF text is extracted and cleaned, key sections are detected, and the
    output file receives a header, every key section longer than 50
    characters, and finally the full cleaned text.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path of the text file to write; parent folders are created.

    Returns:
        True once the file is saved; False when the PDF is missing, yields no
        text, or any error occurs (errors are printed, not raised).
    """
    source = Path(pdf_path)
    if not source.exists():
        print(f"‚ùå PDF file not found: {source.absolute()}")
        return False

    print(f"üìÑ Processing: {source.name}")

    try:
        # Pull the raw text out of the PDF
        extracted = extract_pdf_text(pdf_path)
        if not extracted:
            print(f"‚ùå Could not extract text from {source.name}")
            return False

        print(f"   Extracted {len(extracted)} characters")

        # Tidy the text, then locate the key sections
        body = clean_report_text(extracted)
        key_sections = extract_key_sections(body, source.name)

        # Make sure the destination folder exists
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        # Assemble the document: header, key sections, full text
        divider = "\n\n" + "-"*50 + "\n\n"
        parts = [
            f"TECH REPORT: {source.stem}\n",
            f"SOURCE: {source.name}\n",
            "="*60 + "\n\n",
            "## KEY SECTIONS ##\n\n",
        ]
        for title, content in key_sections.items():
            if not content or len(content) <= 50:
                continue  # Only include substantial sections
            parts.append(f"### {title.upper().replace('_', ' ')} ###\n")
            parts.append(content)
            parts.append(divider)
        parts.append("## FULL REPORT CONTENT ##\n\n")
        parts.append(body)

        with open(output_path, 'w', encoding='utf-8') as handle:
            handle.write("".join(parts))

        print(f"‚úÖ Saved: {output_path}")
        return True

    except Exception as err:
        print(f"‚ùå Error processing {source.name}: {err}")
        return False

def process_all_tech_reports():
    """Run process_tech_report for the three technology reports.

    Prints a banner, processes each report in turn (with a blank line after
    each) and prints how many succeeded.

    Returns:
        True only when every report was processed successfully.
    """
    print("üöÄ PROCESSING TECH REPORTS")
    print("=" * 50)

    catalog = [
        {
            'name': 'WEF Emerging Technologies 2025',
            'input': '../01_data/rag_automotive_tech/raw_sources/tech_reports/wef_emerging_technologies_2025.pdf',
            'output': '../01_data/rag_automotive_tech/processed/tech_reports/wef_emerging_tech_2025.txt'
        },
        {
            'name': 'McKinsey Technology Trends 2025',
            'input': '../01_data/rag_automotive_tech/raw_sources/tech_reports/mckinsey_tech_trends_2025.pdf',
            'output': '../01_data/rag_automotive_tech/processed/tech_reports/mckinsey_tech_trends_2025.txt'
        },
        {
            'name': 'BCG AI Value 2025',
            'input': '../01_data/rag_automotive_tech/raw_sources/tech_reports/bcg_ai_value_2025.pdf',
            'output': '../01_data/rag_automotive_tech/processed/tech_reports/bcg_ai_value_2025.txt'
        }
    ]

    outcomes = []
    for entry in catalog:
        outcomes.append(bool(process_tech_report(entry['input'], entry['output'])))
        print()  # Empty line between reports

    succeeded = sum(outcomes)
    print(f"üìä Completed: {succeeded}/{len(catalog)} reports processed successfully")
    return succeeded == len(catalog)

# Install required package if needed
# NOTE(review): this cell already runs `import PyPDF2` unconditionally at its top,
# so a missing package would raise there before this fallback is reached.
try:
    import PyPDF2
except ImportError:
    print("Installing PyPDF2...")
    import subprocess
    import sys
    # Run pip through sys.executable so the install uses this process's interpreter
    subprocess.check_call([sys.executable, "-m", "pip", "install", "PyPDF2"])
    import PyPDF2

# Process every tech report and summarise the outcome
if __name__ == "__main__":
    success = process_all_tech_reports()
    print(
        "\nüéâ All tech reports processed successfully!"
        if success
        else "\n‚ö†Ô∏è Some reports failed to process. Check the errors above."
    )

üöÄ PROCESSING TECH REPORTS
üìÑ Processing: wef_emerging_technologies_2025.pdf
   Extracted 143886 characters
‚úÖ Saved: ../01_data/rag_automotive_tech/processed/tech_reports/wef_emerging_tech_2025.txt

üìÑ Processing: mckinsey_tech_trends_2025.pdf
   Extracted 310965 characters
‚úÖ Saved: ../01_data/rag_automotive_tech/processed/tech_reports/mckinsey_tech_trends_2025.txt

üìÑ Processing: bcg_ai_value_2025.pdf
   Extracted 64762 characters
‚úÖ Saved: ../01_data/rag_automotive_tech/processed/tech_reports/bcg_ai_value_2025.txt

üìä Completed: 3/3 reports processed successfully

üéâ All tech reports processed successfully!


**Automotive Papers Processing**

In [49]:
def process_automotive_reports(pdf_path, output_path):
    """Process a single automotive paper PDF into a structured text file.

    Automotive papers go through the same steps as the tech reports
    (extract, clean, detect key sections, write header + sections + full
    text), so this delegates to process_tech_report instead of repeating its
    body.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path of the text file to write; parent folders are created.

    Returns:
        True once the file is saved; False when the PDF is missing, yields no
        text, or any error occurs (errors are printed, not raised).
    """
    return process_tech_report(pdf_path, output_path)

def process_all_automative_reports():
    """Run process_automotive_reports for every automotive paper in the corpus.

    Each paper is read from the raw `automotive_papers` folder and written to
    the processed `automotive_papers` folder under the same file stem (".pdf"
    in, ".txt" out). Prints a banner, a blank line after each paper, and a
    final success count.

    Returns:
        True only when every paper was processed successfully.
    """
    raw_dir = '../01_data/rag_automotive_tech/raw_sources/automotive_papers'
    processed_dir = '../01_data/rag_automotive_tech/processed/automotive_papers'

    # File stems exactly as they exist on disk (spelling included); input and
    # output paths are derived from them so the two cannot drift apart.
    paper_stems = [
        'a_benchmark_framework_for_AI_models_in_automative_aerodynamics',
        'AI_agents_in_engineering_design_a_multiagent_framework_for_aesthetic_and_aerodynamic_car_design',
        'automating_automative_software_development_a_synergy_of_generative_AI_and_formal_methods',
        'automotive-software-and-electronics-2030-full-report',
        'drive_disfluency-rich_synthetic_dialog_data_generation_framework_for_intelligent_vehicle_environments',
        'Embedded_acoustic_intelligence_for_automotive_systems',
        'enhanced_drift_aware_computer_vision_achitecture_for_autonomous_driving',
        'Gen_AI_in_automotive_applications_challenges_and_opportunities_with_a_case_study_on_in-vehicle_experience',
        'generative_AI_for_autonomous_driving_a_review',
        'leveraging_vision_language_models_for_visual_grounding_and_analysis_of_automative_UI',
    ]

    reports = [
        {
            'name': stem,
            'input': f"{raw_dir}/{stem}.pdf",
            'output': f"{processed_dir}/{stem}.txt",
        }
        for stem in paper_stems
    ]

    print("üöÄ PROCESSING AUTOMOTIVE REPORTS")
    print("=" * 50)

    success_count = 0
    for report in reports:
        success = process_automotive_reports(report['input'], report['output'])
        if success:
            success_count += 1
        print()  # Empty line between reports

    print(f"üìä Completed: {success_count}/{len(reports)} reports processed successfully")
    return success_count == len(reports)

# Install required package if needed
# NOTE(review): the functions above call extract_pdf_text, which is defined in an
# earlier cell that already imports PyPDF2 - this fallback is presumably redundant; confirm.
try:
    import PyPDF2
except ImportError:
    print("Installing PyPDF2...")
    import subprocess
    import sys
    # Run pip through sys.executable so the install uses this process's interpreter
    subprocess.check_call([sys.executable, "-m", "pip", "install", "PyPDF2"])
    import PyPDF2

# Process every automotive paper and summarise the outcome
if __name__ == "__main__":
    success = process_all_automative_reports()
    print(
        "\nüéâ All automative reports processed successfully!"
        if success
        else "\n‚ö†Ô∏è Some reports failed to process. Check the errors above."
    )

üöÄ PROCESSING AUTOMOTIVE REPORTS
üìÑ Processing: a_benchmark_framework_for_AI_models_in_automative_aerodynamics.pdf
   Extracted 50659 characters
‚úÖ Saved: ../01_data/rag_automotive_tech/processed/automotive_papers/a_benchmark_framework_for_AI_models_in_automative_aerodynamics.txt

üìÑ Processing: AI_agents_in_engineering_design_a_multiagent_framework_for_aesthetic_and_aerodynamic_car_design.pdf
   Extracted 71766 characters
‚úÖ Saved: ../01_data/rag_automotive_tech/processed/automotive_papers/AI_agents_in_engineering_design_a_multiagent_framework_for_aesthetic_and_aerodynamic_car_design.txt

üìÑ Processing: automating_automative_software_development_a_synergy_of_generative_AI_and_formal_methods.pdf
   Extracted 43844 characters
‚úÖ Saved: ../01_data/rag_automotive_tech/processed/automotive_papers/automating_automative_software_development_a_synergy_of_generative_AI_and_formal_methods.txt

üìÑ Processing: automotive-software-and-electronics-2030-full-report.pdf
   Extracted 9200