In [1]:
!conda install nvidia/label/cuda-12.1.1::cuda-nvcc -y
!pip install transformers datasets peft trl accelerate kaggle packaging ninja tf-keras
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install requests networkx alpha_vantage redis sec_api cachetools
!pip install torch_geometric torch_sparse torch_scatter torch_cluster -f https://data.pyg.org/whl/torch-2.5.0+cu121.html
!pip install torch-geometric-temporal==0.54.0
!pip install scikit-learn matplotlib faiss-cpu yfinance qdrant-client seaborn
!pip install graphrag graspologic
!pip install langgraph langchain langchain-core

Channels:
 - conda-forge
 - nvidia/label/cuda-12.1.1
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 23.11.0
    latest version: 25.7.0

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - nvidia/label/cuda-12.1.1::cuda-nvcc


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2025.8.3   |       hbd8a1cb_0         151 KB  conda-forge
    certifi-2025.8.3           |     pyhd8ed1ab_0         155 KB  conda-forge
    cuda-nvcc-12.1.105         |                0        52.4 MB  nvidia/label/cuda-12.1.1
    libgcc-15.1.0              |       h767d61c_4         805 KB  conda-forge
    libgcc-ng-15.1.0           |       h69a702a_4          29 KB  conda-forge
    libgomp-15.1.0             |      

In [2]:
!pip install flash-attn --no-build-isolation

Collecting flash-attn
  Using cached flash_attn-2.8.3-cp311-cp311-linux_x86_64.whl
Collecting einops (from flash-attn)
  Using cached einops-0.8.1-py3-none-any.whl.metadata (13 kB)
Using cached einops-0.8.1-py3-none-any.whl (64 kB)
Installing collected packages: einops, flash-attn
Successfully installed einops-0.8.1 flash-attn-2.8.3


In [None]:
# 
# UNIFIED FINANCIAL AI SYSTEM WITH GRAPH EMBEDDINGS AND AGENTIC REASONING
# 

# Standard library imports
import os
import re
import io
import json
import pickle
import logging
import time
import asyncio
import sqlite3
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Any, Union, TypedDict, Annotated, Literal
from functools import partial

from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langchain_core.messages import BaseMessage, AIMessage, HumanMessage



# Third-party imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv, global_mean_pool
from torch_geometric.loader import NeighborLoader
import torch_geometric.transforms as T
from torch import Tensor

from torch_geometric.nn import HeteroConv, GATv2Conv

import uuid
import numpy as np
from datetime import datetime
import pandas as pd
import networkx as nx
from networkx.algorithms.community import louvain_communities
import requests
import zipfile

import matplotlib.pyplot as plt
from typing import Dict, List, Optional, Any

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

# ML and NLP libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, AutoModel
from peft import get_peft_model, PeftModel
from trl import SFTTrainer
from datasets import Dataset, load_dataset
import faiss



# Storage and caching
from qdrant_client import QdrantClient
from qdrant_client.models import (
    VectorParams, Distance, PointStruct, Filter, FieldCondition, 
    Range, MatchValue, PayloadSchemaType
)
from cachetools import LRUCache, TTLCache, LFUCache

import random; random.seed(42); np.random.seed(42); torch.manual_seed(42)


# Optional SEC API
try:
    from sec_api import ExtractorApi
except ImportError:
    ExtractorApi = None
import yfinance as yf


# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Replace device_map="auto" with specific device
device = "cuda" if torch.cuda.is_available() else "cpu"

import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
# 
# CONFIGURATION AND DATA MODELS
# 

@dataclass
class FinancialEntity:
    """Represents a financial entity in the knowledge graph"""
    name: str
    entity_type: str  # company, metric, regulation, person
    ticker: Optional[str] = None
    sector: Optional[str] = None
    attributes: Dict[str, Any] = None

@dataclass
class FactCheckResult:
    """Results from fact verification process"""
    claim: str
    verified: bool
    confidence: float
    supporting_evidence: List[str]
    contradicting_evidence: List[str]
    timestamp: datetime

@dataclass
class KnowledgeGraphEdge:
    """Represents relationships in the knowledge graph"""
    source: str
    target: str
    relationship_type: str
    confidence: float
    source_document: str
    timestamp: datetime

class SystemConfig:
    """Unified configuration settings for the financial AI system"""
    def __init__(self):
        # Model configuration
        self.phi4_model_path = "microsoft/phi-4"
        self.fine_tuned_model_path = "./phi4-finqa-final"
        self.max_sequence_length = 8192  
        self.embedding_model = "intfloat/multilingual-e5-large"
        self.vector_db_dimension = 1024
        
        # System configuration
        self.cache_ttl = 3600  # 1 hour
        self.fact_check_threshold = 0.8
        self.max_retrieval_docs = 10
        
        # API keys
        self.sec_api_key = os.getenv('sec_api')
        self.alpha_vantage_key = 'YWQ6A107QAJ1TPQR'

        # Add SEC EDGAR specific config:
        self.sec_user_agent = "Financial Research Bot research@example.com"
        self.sec_rate_limit = 0.11  # 10 requests per second = 0.1s, add buffer
        
        # Graph configuration
        self.embedding_dim = 1024
        self.graph_storage_path = "./financial_embeddings_db"
        self.max_workers = 5
        self.training_epochs = 30

        # GPU Memory Management
        self.gpu_memory_fraction = 0.9  
        self.max_gpu_memory = "32GiB"    # Maximum GPU memory per device
        self.cpu_offload_memory = "24GiB"  # CPU memory for offloading
        
        # Apply memory limits
        self._setup_gpu_memory_limits()
    
    def _setup_gpu_memory_limits(self):
        """Setup GPU memory limits"""
        if torch.cuda.is_available():
            # Clear any existing allocations first
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
            
            # Set much more conservative memory fraction
            torch.cuda.set_per_process_memory_fraction(self.gpu_memory_fraction)
            
            # Set environment for memory management
            os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128,expandable_segments:True'
            
            logger.info(f"GPU memory STRICTLY limited to {self.gpu_memory_fraction*100}% of available memory")

# 
# LANGGRAPH STATE DEFINITION
# 

# In the LANGGRAPH STATE DEFINITION section
class FinancialState(TypedDict):
    """
    The unified state for the financial agentic workflow.
    """
    # Core message handling for the conversation
    messages: Annotated[List[BaseMessage], add_messages]
    
    # The agent responsible for all knowledge graph operations
    kg_agent: Any
    
    # Results from the initial query analysis step
    analysis_results: Dict[str, Any]
    
    # The specific financial entities resolved from the query
    entities: List[Dict[str, Any]]
    
    # NEW: A dedicated key to hold insights from the GNN model
    gnn_analysis: Dict[str, Any]


def extract_json_from_response(text: str) -> Optional[Union[Dict, List]]:
    # Preferred: within ``````
    match = re.search(r"``````", text)
    if match:
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError:
            pass
    # Fallback: first {...}
    match = re.search(r"(\{[\s\S]+?\})", text)
    if match:
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError:
            pass
    # Fallback: first [...] (for lists)
    match = re.search(r"(\[[\s\S]+?\])", text)
    if match:
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError:
            pass
    # Last resort: scan for first bracket, last closing bracket (your current logic)
    try:
        first_bracket_pos = min([p for p in [text.find('{'), text.find('[')] if p != -1], default=None)
        last_bracket_pos = max(text.rfind('}'), text.rfind(']'))
        if first_bracket_pos is not None and last_bracket_pos != -1:
            potential_json = text[first_bracket_pos:last_bracket_pos+1]
            return json.loads(potential_json)
    except Exception:
        pass
    return None


# 
# ENHANCED SEC EDGAR DATA ENRICHMENT (From GNN Training Code)
# 


class KaggleXBRLDataEnricher:
    """
    ENHANCED: Handles a memory-efficient hybrid data enrichment stream with 
    robust caching, retry logic, and comprehensive error handling.
    """
    def __init__(self, user_agent="Financial Research Bot research@example.com"):
        self.user_agent = user_agent
        self.api_cache = {}
        # Debugging counters
        self.other_count = 0
        self.other_examples_shown = 0
        self.sector_samples = {}
        
    def process_and_save_data(self, output_path: str, cache_file: str = "./sec_api_cache.json"):
        """
        Main method to stream, enrich, and save company data to a .jsonl file.
        """
        # Step 1 & 2: Identify missing CIKs and update the API cache
        self._update_api_cache(cache_file)

        # Step 3: Stream from files, enrich using the cache, and write to output
        edgar_downloader = EDGARDataDownloader(user_agent=self.user_agent)
        companyfacts_dir = "edgar_data/companyfacts"
        data_generator = edgar_downloader.parse_company_facts(companyfacts_dir)
        
        processed_count = 0
        with open(output_path, 'w', encoding='utf-8') as f:
            for company in data_generator:
                enriched_company = self._enrich_single_company(company)
                f.write(json.dumps(enriched_company) + '\n')
                processed_count += 1
        
        logger.info(f" Saved {processed_count} enriched companies to '{output_path}'.")

    def _update_api_cache(self, cache_file: str):
        """
        ENHANCED: Fetches missing CIKs in batches with exponential backoff retry logic
        and saves progress incrementally to handle rate limits and interruptions gracefully.
        """
        # Step 1: Load existing cache
        try:
            with open(cache_file, 'r') as f:
                self.api_cache = json.load(f)
            logger.info(f"Loaded {len(self.api_cache)} CIKs from API cache '{cache_file}'.")
        except (FileNotFoundError, json.JSONDecodeError):
            self.api_cache = {}
            logger.info("No existing API cache found. A new one will be created.")

        # Step 2: Identify all CIKs from the source files
        edgar_downloader = EDGARDataDownloader(user_agent=self.user_agent)
        companyfacts_dir = "edgar_data/companyfacts"
        all_ciks = {c['cik'] for c in edgar_downloader.parse_company_facts(companyfacts_dir)}
        
        # Determine which CIKs are new and need to be fetched
        ciks_to_fetch = sorted(list(all_ciks - set(self.api_cache.keys())))

        if not ciks_to_fetch:
            logger.info(" API cache is already up-to-date.")
            return

        logger.info(f" Found {len(ciks_to_fetch)} new CIKs to fetch.")

        # Step 3: Fetch in batches and save progress incrementally
        batch_size = 500
        for i in range(0, len(ciks_to_fetch), batch_size):
            batch = ciks_to_fetch[i:i+batch_size]
            logger.info(f" Processing batch {i//batch_size + 1}/{-(-len(ciks_to_fetch)//batch_size)} (CIKs {i} to {i+len(batch)-1}) ")
            
            with ThreadPoolExecutor(max_workers=4) as executor:
                # Map each future to its CIK
                future_to_cik = {executor.submit(self._fetch_sec_data_for_cik, cik): cik for cik in batch}
                
                # Process results as they complete
                for future in as_completed(future_to_cik):
                    cik = future_to_cik[future]
                    result = future.result()
                    if result:  # Only add to cache if the fetch was successful
                        self.api_cache[cik] = result
            
            # Save progress after each batch
            try:
                with open(cache_file, 'w') as f:
                    json.dump(self.api_cache, f)
                logger.info(f" Batch complete. API cache now has {len(self.api_cache)} entries.")
            except Exception as e:
                logger.error(f"Could not save API cache to '{cache_file}': {e}")

    def _fetch_sec_data_for_cik(self, cik: str) -> Optional[Dict]:
        """Enhanced fetch with exponential backoff retry mechanism"""
        if not cik or not str(cik).isdigit(): 
            return None
        
        url = f"https://data.sec.gov/submissions/CIK{str(cik).zfill(10)}.json"
        headers = {'User-Agent': self.user_agent}

        # Enhanced retry mechanism with exponential backoff
        max_retries = 5
        base_delay = 1  # Start with a 1-second delay

        for attempt in range(max_retries):
            try:
                response = requests.get(url, headers=headers)
                response.raise_for_status()  # Will raise an exception for 4xx/5xx errors
                
                # Success! Pause slightly to respect rate limits even on success.
                time.sleep(0.9) 
                return response.json()

            except requests.exceptions.RequestException as e:
                # Specifically check for the "Too Many Requests" error
                if hasattr(e, 'response') and e.response is not None and e.response.status_code == 429:
                    delay = base_delay * (2 ** attempt) + np.random.uniform(0, 1)
                    logger.warning(
                        f"Rate limit exceeded for CIK {cik}. "
                        f"Retrying in {delay:.2f} seconds... (Attempt {attempt + 1}/{max_retries})"
                    )
                    time.sleep(delay)
                else:
                    # For other errors (e.g., 404 Not Found), don't retry.
                    logger.warning(f"Failed to fetch data for CIK {cik}. Reason: {e}")
                    return None  # Stop trying for this CIK
        
        # If all retries fail, log it and give up.
        logger.error(f"All {max_retries} retries failed for CIK {cik}. Giving up.")
        return None

    def _enrich_single_company(self, base_company: Dict) -> Dict:
        """Enriches a single company record with API data and a standardized sector."""
        api_data = self.api_cache.get(base_company['cik'])
        
        # Start with the base data from the file
        final_data = base_company
        
        # If API data exists, intelligently merge it, preferring API values
        # but falling back to the base data if a key is missing from the API response.
        if api_data:
            final_data = {
                'cik': base_company['cik'],  # Keep the original CIK
                'entityName': api_data.get('name') or base_company.get('entityName'),
                'tickers': api_data.get('tickers') or base_company.get('tickers', []),
                'exchanges': api_data.get('exchanges') or base_company.get('exchanges', []),
                'sic': api_data.get('sic') or base_company.get('sic'),  
                'sicDescription': api_data.get('sicDescription') or base_company.get('sicDescription'),
                'stateOfIncorporation': api_data.get('stateOfIncorporation') or base_company.get('stateOfIncorporation'),
                'fiscalYearEnd': api_data.get('fiscalYearEnd') or base_company.get('fiscalYearEnd'),
                'facts': base_company.get('facts', {})  # Always keep the detailed facts from the file
            }

        # Enhanced sector mapping
        sector = 'Other'
        sic_code_str = final_data.get('sic')
        if sic_code_str and str(sic_code_str).isdigit():
            sector = self._sic_to_sector(int(sic_code_str))
        
        # Collect sector samples for debugging
        if sector not in self.sector_samples:
            self.sector_samples[sector] = []
        
        if len(self.sector_samples[sector]) < 5:
            sample_info = f"Name: '{final_data.get('entityName', 'N/A')}', SIC: {sic_code_str}"
            self.sector_samples[sector].append(sample_info)

        final_data['market_data'] = {
            'sector': sector, 
            'industry': final_data.get('sicDescription', ''),
            'data_source': 'edgar_hybrid_enhanced'
        }
        return final_data
        
    def _sic_to_sector(self, sic_code: int) -> str:
        """Enhanced SIC to sector mapping with more comprehensive classifications"""
        if not sic_code or not 100 <= sic_code <= 9999: 
            return 'Other'
        
        # Special cases first
        if 2836 == sic_code: return 'Biotechnology'
        if 5961 == sic_code: return 'E-commerce'
        
        # Standard SIC ranges
        if 100 <= sic_code <= 999: return 'Agriculture'
        if 1000 <= sic_code <= 1499: return 'Mining'
        if 1500 <= sic_code <= 1799: return 'Construction'
        
        if 2000 <= sic_code <= 3999:
            # Technology manufacturing subcategories
            tech_mfg = [(3570, 3579), (3600, 3699), (3812, 3812), (3823, 3829), (3840, 3849)]
            if any(s <= sic_code <= e for s, e in tech_mfg): 
                return 'Technology'
            # Healthcare/biotech manufacturing
            if 2834 <= sic_code <= 2836: 
                return 'Healthcare'
            return 'Manufacturing'
        
        if 4000 <= sic_code <= 4999:
            # Telecom/tech services
            if 4800 <= sic_code <= 4899: 
                return 'Technology'
            return 'Transportation & Public Utilities'
        
        if 5000 <= sic_code <= 5199: return 'Wholesale Trade'
        if 5200 <= sic_code <= 5999: return 'Retail Trade'
        if 6000 <= sic_code <= 6799: return 'Financial Services'
        
        if 7000 <= sic_code <= 8999:
            # Technology services subcategories
            tech_svc = [(7370, 7379), (7380, 7389), (8711, 8711), (8748, 8748)]
            if any(s <= sic_code <= e for s, e in tech_svc): 
                return 'Technology'
            # Healthcare services
            if 8000 <= sic_code <= 8099: 
                return 'Healthcare'
            # Entertainment & Media
            ent_media = [(7800, 7841), (4830, 4841)]
            if any(s <= sic_code <= e for s, e in ent_media): 
                return 'Entertainment & Media'
            return 'Services'
        
        if 9100 <= sic_code <= 9729: return 'Public Administration'
        return 'Other'

    def enrich_companies_parallel(self, companies_data, max_workers=3):
        """Legacy method maintained for compatibility"""
        logger.info("Using enhanced KaggleXBRLDataEnricher for parallel enrichment...")
        return self.process_and_save_data("./enriched_companies.jsonl")

    def fetch_edgar_company_data(self, cik):
        """Legacy method wrapper for compatibility"""
        return self._fetch_sec_data_for_cik(str(cik))

class SECAPITool:
    """Enhanced SEC API Tool with Yahoo Finance fallback"""

    def __init__(self, user_agent="Financial Research Bot research@example.com"):
        self.user_agent = user_agent
        self.headers = {"User-Agent": self.user_agent}
        # Cache for company name to ticker mappings
        self.ticker_cache = {}
        
    def get_ticker_from_company_name(self, company_name: str) -> str:
        """
        Convert company name to ticker symbol using Yahoo Finance Search API
        """
        # Check cache first
        if company_name in self.ticker_cache:
            return self.ticker_cache[company_name]
            
        try:
            url = "https://query2.finance.yahoo.com/v1/finance/search"
            
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
            }
            
            params = {
                "q": company_name,
                "quotes_count": 1,
                "country": "United States"
            }
            
            response = requests.get(url=url, params=params, headers=headers)
            response.raise_for_status()
            
            data = response.json()
            
            # Extract ticker from response
            if data.get('quotes') and len(data['quotes']) > 0:
                ticker = data['quotes'][0]['symbol']
                self.ticker_cache[company_name] = ticker  # Cache the result
                return ticker
                
        except Exception as e:
            logger.error(f"Error getting ticker for {company_name}: {e}")
        
        return None

    def get_yahoo_finance_data(self, company_name: str) -> Optional[Dict[str, Any]]:
        """
        Get comprehensive financial data from Yahoo Finance
        """
        # First, get the ticker symbol
        ticker = self.get_ticker_from_company_name(company_name)
        if not ticker:
            logger.info(f" ticker not found for {company_name}")
            return None
            
        try:
            # Create yfinance Ticker object
            logger.info(f" ticker {ticker} found for {company_name}")
            stock = yf.Ticker(ticker)
            info = stock.info
            
            if not info:
                return None
                
            # Extract key financial metrics
            financial_data = {
                "entityName": info.get('longName') or info.get('shortName') or company_name,
                "ticker": ticker,
                "sector": info.get('sector', 'N/A'),
                "industry": info.get('industry', 'N/A'),
                
                # Valuation Ratios
                "P/E Ratio": info.get('trailingPE'),
                "Forward P/E": info.get('forwardPE'), 
                "P/B Ratio": info.get('priceToBook'),
                "P/S Ratio": info.get('priceToSalesTrailing12Months'),
                "PEG Ratio": info.get('pegRatio'),
                
                # Market Data
                "Market Cap": info.get('marketCap'),
                "Current Price": info.get('currentPrice') or info.get('regularMarketPrice'),
                "52 Week High": info.get('fiftyTwoWeekHigh'),
                "52 Week Low": info.get('fiftyTwoWeekLow'),
                
                # Financial Metrics
                "Revenue": info.get('totalRevenue'),
                "Revenue Per Share": info.get('revenuePerShare'),
                "Total Cash": info.get('totalCash'),
                "Total Debt": info.get('totalDebt'),
                "Book Value": info.get('bookValue'),
                "Enterprise Value": info.get('enterpriseValue'),
                
                # Profitability
                "Profit Margins": info.get('profitMargins'),
                "Operating Margins": info.get('operatingMargins'),
                "ROE": info.get('returnOnEquity'),
                "ROA": info.get('returnOnAssets'),
                
                # Dividend Info
                "Dividend Yield": info.get('dividendYield'),
                "Dividend Rate": info.get('dividendRate'),
                
                # Growth & Performance
                "Revenue Growth": info.get('revenueGrowth'),
                "Earnings Growth": info.get('earningsGrowth'),
                "Beta": info.get('beta'),
                
                # Share Information
                "Shares Outstanding": info.get('sharesOutstanding'),
                "Float": info.get('floatShares'),
                
                "data_source": "yahoo_finance"
            }
            
            # Clean up None values and format numbers
            cleaned_data = {}
            for key, value in financial_data.items():
                if value is not None and key != "data_source":
                    if isinstance(value, (int, float)) and abs(value) > 1000000:
                        # Keep large numbers as is for calculations
                        cleaned_data[key] = value
                    else:
                        cleaned_data[key] = value
                elif key in ["entityName", "ticker", "sector", "industry", "data_source"]:
                    cleaned_data[key] = value or "N/A"
                   
            return cleaned_data
            
        except Exception as e:
            logger.error(f"Yahoo Finance API failed for {company_name} ({ticker}): {e}")
            return None

    def get_live_company_data(self, cik: str) -> Optional[Dict[str, Any]]:
        """
        ENHANCED: Try SEC first, then fallback to Yahoo Finance
        """
        # Step 1: Try SEC EDGAR API first
        sec_data = super().get_live_company_data(cik)
        
        if sec_data and any(isinstance(v, (int, float)) and v > 0 for v in sec_data.values() if v):
            logger.info(f" Successfully fetched SEC data for CIK {cik}")
            return sec_data
            
        # Step 2: Fallback to Yahoo Finance using company name lookup
        logger.info(f"SEC data insufficient for CIK {cik}. Trying Yahoo Finance fallback...")
        
        # Try to get company name from SEC first
        company_name = None
        if sec_data and sec_data.get('entityName'):
            company_name = sec_data['entityName']
        else:
            # Try a basic SEC submission call to get just the company name
            try:
                formatted_cik = str(cik).zfill(10)
                submissions_url = f"https://data.sec.gov/submissions/CIK{formatted_cik}.json"
                response = requests.get(submissions_url, headers=self.headers)
                if response.status_code == 200:
                    data = response.json()
                    company_name = data.get('name')
            except:
                pass
                
        if company_name:
            yahoo_data = self.get_yahoo_finance_data(company_name)
            if yahoo_data:
                logger.info(f" Successfully fetched Yahoo Finance data for {company_name}")
                return yahoo_data
                
        logger.warning(f"Both SEC and Yahoo Finance failed for CIK {cik}")
        return None
        
# 
# PHI-4 FINE-TUNED MODEL HANDLER
# 

class FineTunedPhi4Agent:
    """Handler for fine-tuned Phi-4 model"""

    def __init__(self, config: SystemConfig):
        self.config = config
        self.model = None
        self.tokenizer = None
        self.load_model()

    def _count_tokens(self, text: str) -> int:
        """Counts the actual number of tokens in a string."""
        if not self.tokenizer:
            return len(text) // 4 # Fallback estimation
        return len(self.tokenizer.encode(text))
    
    def load_model(self):
        """Load the fine-tuned Phi-4 model"""
        try:
            
            self.tokenizer = AutoTokenizer.from_pretrained(self.config.fine_tuned_model_path)
            self.tokenizer.pad_token = self.tokenizer.eos_token

            # Load base model first
            base_model = AutoModelForCausalLM.from_pretrained(
                self.config.phi4_model_path,
                torch_dtype=torch.bfloat16,
                trust_remote_code=True,
                device_map="auto",
                attn_implementation="flash_attention_2",
                use_cache=True,
                offload_folder="./offload",
            )

            # Try to load PEFT adapters
            try:
                self.model = PeftModel.from_pretrained(
                    base_model,
                    self.config.fine_tuned_model_path,
                    offload_folder="./offload"
                )
                logger.info("Fine-tuned Phi-4 model loaded successfully")
            except:
                self.model = base_model
                logger.info("Base Phi-4 model loaded (fine-tuned version unavailable)")

        except Exception as e:
            logger.error(f"Error loading model: {e}")
            # Use a placeholder model
            self.model = None
            self.tokenizer = None

    def generate(self, prompt: str, max_new_tokens: int = 150, temperature: float = 0.7, do_sample: bool = True) -> str:
        """Generate response using the model with proper tokenization handling."""
        if not self.model or not self.tokenizer:
            return f"Model unavailable. Response for: {prompt[:100]}..."
    
        # Build messages in training format
        messages = [
            {"role": "system", "content": "You are a financial assistant. Reason step-by-step to answer the user's question based on the provided context."},
            {"role": "user", "content": prompt}
        ]
    
        # Apply chat template
        try:
            final_prompt = self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
        except Exception as e:
            logger.warning(f"Chat template failed, using fallback: {e}")
            final_prompt = f"<|system|>\nYou are a financial assistant.\n<|end|>\n<|user|>\n{prompt}\n<|end|>\n<|assistant|>\n"
    
        # Tokenize with proper settings
        inputs = self.tokenizer(
            final_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self.config.max_sequence_length - max_new_tokens,
            padding=False
        )
    
        # Move to device
        device = next(self.model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}
    
        # Generate with proper stopping criteria
        try:
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature if do_sample else 1.0,
                    do_sample=do_sample,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    repetition_penalty=1.2,
                    num_return_sequences=1,
                    # Remove early_stopping=True as it's not valid for this model
                    use_cache=True
                )
        except Exception as e:
            logger.error(f"Generation failed: {e}")
            return "I apologize, but I encountered an error generating the response."
    
        # Correct slicing - use shape[1] not shape[26]
        input_length = inputs['input_ids'].shape[1]
        new_tokens = outputs[0][input_length:]  
        
        try:
            response = self.tokenizer.decode(
                new_tokens, 
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )
        except Exception as e:
            logger.error(f"Decoding failed: {e}")
            return "I apologize, but I encountered an error decoding the response."
    
        # Clean up artifacts
        response = response.replace("<|end|>", "").replace("<|assistant|>", "")
        
        # Fix character ladder issue by removing excessive newlines
        response = re.sub(r'\n\s*\n\s*\n+', '\n\n', response)  # Max 2 consecutive newlines
        response = re.sub(r'(\w)\n(\w)', r'\1 \2', response)    # Fix broken words
        
        return response.strip()

                 

    def extract_claims(self, text: str) -> List[str]:
        """Extract factual claims from text for verification"""
        prompt = f"""
        Extract specific factual claims from this financial text:
        Text: {text}
        
        Return only concrete, verifiable claims as a JSON list:
        ["claim1", "claim2", "claim3"]
        """
        
        response = self.generate(prompt, max_new_tokens=200, temperature=0.1)
        
        claims = extract_json_from_response(response)
        if claims and isinstance(claims, list):
            return claims
        else:
            # Fallback: simple sentence splitting
            sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 10]
            return sentences[:5]

    def analyze_query(self, query: str) -> Dict[str, Any]:
        """Enhanced query analysis with better routing logic"""
        
        strict_prompt = f"""You are an expert financial query analyzer. Your task is to analyze the user's query and return a JSON object with the query type and any identified financial entities. Your response must be ONLY the JSON object.
    
        **Rules:**
        - **query_type**: Must be one of "local", "global", or "numerical".
        - **Query Types Defined:**
          - "local": Finding similar companies, comparisons between specific entities
          - "global": Asking for specific financial data/ratios of a single company, general market questions
          - "numerical": Mathematical calculations with specific numbers provided in query
        - **entities**: A list of company names. Always include entities even for "global" queries if companies are mentioned.
        
        **Examples:**
        - Query: "Find companies similar to Apple Inc" → {{"query_type": "local", "entities": ["Apple Inc"]}}
        - Query: "What is the P/E ratio of Apple?" → {{"query_type": "global", "entities": ["Apple Inc"]}}
        - Query: "What are the top tech companies?" → {{"query_type": "global", "entities": []}}
        - Query: "If a company has $500 in revenue and $100 in profit, what is the profit margin?" → {{"query_type": "numerical", "entities": []}}
    
        **User Query to Analyze:**
        "{query}"
        
        **JSON Output:**
        """
        
        try:
            response = self.generate(strict_prompt, max_new_tokens=100, temperature=0.1)
            analysis = extract_json_from_response(response)
            
            if analysis and "query_type" in analysis and "entities" in analysis:
                return analysis
                
        except Exception as e:
            logger.warning(f"Strict JSON prompt failed: {e}")
        
        # Pattern-based fallback with better logic
        return self._pattern_based_analysis_enhanced(query)
    
    def _pattern_based_analysis_enhanced(self, query: str) -> Dict[str, Any]:
        """Enhanced pattern-based analysis with better query type detection"""
        
        # Extract entities
        company_pattern = r'\b(?:[A-Z][A-Za-z-&\'\.]+\s?)+(?:Inc|Corp|LLC|Ltd|Co|Group|PLC)?\.?\b'
        ticker_pattern = r'\b([A-Z]{2,5})\b'
        
        entities = re.findall(company_pattern, query, re.IGNORECASE)
        entities.extend(m for m in re.findall(ticker_pattern, query) if len(m) > 1)
        
        # Filter stopwords
        stopwords = {"Find", "Compare", "What", "Show", "The", "Who", "When", "Where", "Why", "How", "Companies", "That", "Are"}
        final_entities = [e.strip() for e in entities if e.strip().title() not in stopwords]
        
        # Better query type detection
        query_lower = query.lower()
        
        # Check for similarity queries FIRST (highest priority)
        competitor_patterns = [
            'similar', 'like', 'competitors', 'compete', 'peers', 'vs', 
            'comparable', 'rivalry', 'alternative', 'substitute'
        ]
        if any(pattern in query_lower for pattern in competitor_patterns):
            return {"query_type": "local", "entities": final_entities}
        
        # Check for numerical patterns
        if any(calc_word in query_lower for calc_word in ['calculate', 'compute', 'if', 'given that', '%', 'margin']):
            if re.search(r'\$?\d+', query):  # Contains specific numbers
                return {"query_type": "numerical", "entities": final_entities}
        
        # Check for specific data requests about entities
        if final_entities and any(data_word in query_lower for data_word in ['ratio', 'price', 'revenue', 'income', 'assets', 'what is']):
            return {"query_type": "global", "entities": final_entities}
        
        # Default logic
        query_type = "local" if final_entities else "global"
        
        return {
            "query_type": query_type,
            "entities": final_entities,
            "intent": "search",
            "complexity": 0.5,
            "extraction_method": "enhanced_pattern_based_fallback"
        }


    
    def _extract_json_with_multiple_strategies(self, response: str) -> Dict[str, Any]:
        """Multiple JSON extraction strategies"""
        
        strategies = [
            # Strategy 1: Direct JSON parsing
            lambda r: json.loads(r.strip()),
            
            # Strategy 2: Regex extraction
            lambda r: json.loads(re.search(r'\{.*\}', r, re.DOTALL).group()),
            
            # Strategy 3: Line-by-line parsing
            lambda r: json.loads('\n'.join([line.strip() for line in r.split('\n') if line.strip().startswith('{')][0])),
            
            # Strategy 4: Remove common prefixes
            lambda r: json.loads(re.sub(r'^[^{]*', '', r).split('\n')[0])
        ]
        
        for i, strategy in enumerate(strategies):
            try:
                result = strategy(response)
                if isinstance(result, dict):
                    logger.info(f"JSON extraction succeeded with strategy {i+1}")
                    return result
            except:
                continue
        
        return None

    
    def _validate_analysis_structure(self, analysis: Dict) -> bool:
        """Validate that analysis has required structure"""
        if not analysis or not isinstance(analysis, dict):
            return False
        
        required_keys = ["intent", "entities", "complexity"]
        return all(key in analysis for key in required_keys)

# 
# GRAPHSAGE EMBEDDINGS AND TRAINING
# 

class MultilingualE5Embedder:
    """
    Multilingual-E5-Large embedder using transformers library
    Replaces SentenceTransformer with better performance and 1024-dimensional outputs
    """
    
    def __init__(self, model_name='intfloat/multilingual-e5-large', device=None):
        self.model_name = model_name
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        
        # Load tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,  # Use fp16 to save memory
        ).to(self.device)
        self.model.eval()
        
        logger.info(f"Loaded {model_name} on {self.device}")
    
    def average_pool(self, last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
        """Average pooling for sentence embeddings"""
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    
    def format_text_with_prefix(self, text: str, text_type: str = "passage") -> str:
        """Format text with appropriate E5 prefixes for optimal performance"""
        if text_type == "query":
            return f"query: {text}"
        elif text_type == "passage":
            return f"passage: {text}"
        else:
            return f"passage: {text}"  # Default to passage
    
    def encode(self, texts: Union[str, List[str]], 
               text_type: str = "passage", 
               normalize: bool = True, 
               batch_size: int = 32,
               show_progress: bool = False) -> np.ndarray:
        """
        Encode texts to embeddings with proper E5 prefixes
        """
        
        # Handle single text input
        if isinstance(texts, str):
            texts = [texts]
        
        # Handle empty text lists
        if not texts:
            logger.warning("Empty text list provided to encoder. Returning empty array.")
            return np.empty((0, 1024), dtype=np.float32)  # Return empty array with correct shape
        
        # Format texts with prefixes
        formatted_texts = [self.format_text_with_prefix(text, text_type) for text in texts]
        
        all_embeddings = []
        
        for i in range(0, len(formatted_texts), batch_size):
            batch_texts = formatted_texts[i:i+batch_size]
            
            # Tokenize batch
            batch_dict = self.tokenizer(
                batch_texts, 
                max_length=512,  # E5 model max length
                padding=True, 
                truncation=True, 
                return_tensors='pt'
            )
            
            # Move to device
            batch_dict = {k: v.to(self.device) for k, v in batch_dict.items()}
            
            # Generate embeddings
            with torch.no_grad():
                outputs = self.model(**batch_dict)
                embeddings = self.average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
                
                if normalize:
                    embeddings = F.normalize(embeddings, p=2, dim=1)
                
                all_embeddings.append(embeddings.cpu())
            
            if show_progress and i % (batch_size * 10) == 0:
                logger.info(f"Processed {i + len(batch_texts)}/{len(formatted_texts)} texts")
        
        # Handle empty embeddings list
        if not all_embeddings:
            logger.warning("No embeddings generated. Returning empty array.")
            return np.empty((0, 1024), dtype=np.float32)
        
        # Concatenate all embeddings
        final_embeddings = torch.cat(all_embeddings, dim=0)
        return final_embeddings.to(torch.float32).numpy()

    
    def encode_queries(self, queries: Union[str, List[str]], **kwargs) -> np.ndarray:
        """Encode queries with query prefix"""
        return self.encode(queries, text_type="query", **kwargs)
    
    def encode_passages(self, passages: Union[str, List[str]], **kwargs) -> np.ndarray:
        """Encode passages with passage prefix"""
        return self.encode(passages, text_type="passage", **kwargs)


class EnhancedFinancialKnowledgeGraph:
    """
    REFACTORED: This version is designed to be built entirely from a data stream
    to handle datasets that are larger than available RAM.
    """
    
    def __init__(self):
        self.graph = nx.DiGraph()
        self.entity_attributes = {}
        self.relationship_types = set()
        self.feature_names = [
            'type_company', 'type_metric', 'type_industry', 'type_market_metric',
            'total_assets', 'revenues', 'net_income', 'stockholders_equity',
            'mkt_current_price', 'mkt_market_cap', 'mkt_pe_ratio', 'mkt_pb_ratio',
            'mkt_dividend_yield', 'mkt_beta', 'mkt_return_on_equity', 'mkt_debt_to_equity',
            'sic_numeric', 'num_exchanges', 'num_tickers'
        ]
        self.feature_map = {name: i for i, name in enumerate(self.feature_names)}
        self.industry_to_companies = {}

    def build_graph_from_stream(self, companies_data_generator: iter):
        """
        NEW: Builds the graph in two passes directly from the stream.
        Pass 1 adds all nodes. Pass 2 adds complex edges that require global context.
        """
        #  PASS 1: Add all nodes and their attributes from the stream 
        logger.info("Building graph - Pass 1/2: Streaming and adding all nodes...")
        for company in companies_data_generator:
            self._add_company_node(company)
            # Also add simple, single-company relationships during this pass
            self._add_financial_relationships(company)
            self._add_industry_relationships(company)
            self._add_market_relationships(company)
        
        logger.info(f"Pass 1 complete. Graph has {self.graph.number_of_nodes()} nodes.")

        #  PASS 2: Add relationships that require all nodes to be present 
        logger.info("Building graph - Pass 2/2: Adding peer and market cap relationships...")
        self._group_companies_by_industry_from_graph() # This now works on the graph itself
        self._add_peer_relationships()
        self._add_market_cap_relationships()
        
        logger.info(f"Built graph with {self.graph.number_of_nodes()} nodes and {self.graph.number_of_edges()} edges.")
        self._analyze_graph_connectivity()

    def _group_companies_by_industry_from_graph(self):
        """NEW: Groups companies by iterating over the graph nodes, not a list."""
        logger.info("Grouping companies by sector directly from graph nodes...")
        for node_id, attrs in self.graph.nodes(data=True):
            if attrs.get('type') == 'company':
                sector = attrs.get('sector', 'Other')
                if sector not in self.industry_to_companies:
                    self.industry_to_companies[sector] = []
                self.industry_to_companies[sector].append(node_id)
        
        total_grouped = sum(len(c) for c in self.industry_to_companies.values())
        logger.info(f"Grouped {total_grouped} companies into {len(self.industry_to_companies)} sectors.")
    
    def _analyze_graph_connectivity(self):
        """Analyze and report graph connectivity"""
        
        if self.graph.number_of_nodes() == 0:
            logger.warning("Graph is empty!")
            return
        
        # Convert to undirected for connectivity analysis
        undirected = self.graph.to_undirected()
        
        # Find connected components
        components = list(nx.connected_components(undirected))
        largest_component = max(components, key=len) if components else set()
        
        logger.info(f"Graph connectivity analysis:")
        logger.info(f"  Total nodes: {self.graph.number_of_nodes()}")
        logger.info(f"  Total edges: {self.graph.number_of_edges()}")
        logger.info(f"  Connected components: {len(components)}")
        logger.info(f"  Largest component size: {len(largest_component)} ({len(largest_component)/self.graph.number_of_nodes()*100:.1f}%)")
        
        if len(components) > 1:
            logger.warning(f"Graph has {len(components)} disconnected components!")
            component_sizes = sorted([len(comp) for comp in components], reverse=True)
            logger.info(f"  Component sizes: {component_sizes[:10]}")  # Show top 10

            
    def _add_company_node(self, company):
        """
        MODIFIED: Adds a company node but no longer stores the huge raw 'facts' 
        dictionary as a node attribute to save significant memory.
        """
        cik = str(company['cik'])
        entity_name = company['entityName']
        market_data = company.get('market_data', {})
        
        # This is the raw financial data, which can be very large
        facts = company.get('facts', {})
        
        # We extract the necessary numerical metrics from 'facts'
        financial_attrs = self._extract_financial_metrics(facts)
        market_attrs = self._process_market_attributes(market_data)

        node_attrs = {
            'type': 'company',
            'name': entity_name,
            'cik': cik,
            'sector': market_data.get('sector', 'Other'), 
            'tickers': company.get('tickers', []),
            'sic': company.get('sic', ''),
            'sic_description': company.get('sicDescription', ''),
            'state': company.get('stateOfIncorporation', ''),
            'fiscal_year_end': company.get('fiscalYearEnd', ''),
            'exchanges': company.get('exchanges', [])
        }
        
        # Add the extracted financial and market metrics
        node_attrs.update(financial_attrs)
        node_attrs.update(market_attrs)
        
        # CRITICAL CHANGE: We DO NOT add the raw 'facts' object to the node.
        # The 'financial_attrs' already contain everything we need for GNN features.
        
        self.graph.add_node(cik, **node_attrs)
        self.entity_attributes[cik] = node_attrs


    def _add_peer_relationships(self):
        """
        Adds peer relationships using a capped random sampling strategy
        for ALL sectors to prevent combinatorial explosion and over-density.
        """
        logger.info(f"Creating peer-to-peer edges for {len(self.industry_to_companies)} industries...")
        edge_count = 0
        # Set a hard limit on connections per company to keep the graph sparse
        max_connections_per_company = 15 
        
        import random
        
        for sector, companies in self.industry_to_companies.items():
            if len(companies) <= 1:
                continue
            
            logger.info(f"  Processing {sector} with {len(companies)} companies")
            
            # Apply the same safe, sampling logic to every sector
            for company in companies:
                # Find potential peers (all other companies in the sector)
                peers = [c for c in companies if c != company]
                
                # Determine how many connections to make
                num_to_connect = min(max_connections_per_company, len(peers))
                selected_peers = random.sample(peers, num_to_connect)
                
                for peer in selected_peers:
                    # Add a single directed edge (will be treated as undirected by GNNs later)
                    if not self.graph.has_edge(company, peer):
                        self.graph.add_edge(company, peer, relationship='peer_of')
                        edge_count += 1
                        
        self.relationship_types.add('peer_of')
        logger.info(f"Added {edge_count} 'peer_of' edges to the graph.")

    
    def _add_market_cap_relationships(self):
        """Add relationships between companies of similar market cap"""
        
        logger.info("Adding market cap similarity relationships...")
        
        # Group companies by market cap ranges
        cap_ranges = {
            'mega_cap': [],    # > 200B
            'large_cap': [],   # 10B - 200B  
            'mid_cap': [],     # 2B - 10B
            'small_cap': []    # < 2B
        }
        
        for node_id, attrs in self.graph.nodes(data=True):
            if attrs.get('type') == 'company':
                market_cap = attrs.get('mkt_market_cap', 0)
                
                if market_cap > 200e9:
                    cap_ranges['mega_cap'].append(node_id)
                elif market_cap > 10e9:
                    cap_ranges['large_cap'].append(node_id)
                elif market_cap > 2e9:
                    cap_ranges['mid_cap'].append(node_id)
                else:
                    cap_ranges['small_cap'].append(node_id)
        
        # Create connections within each market cap range
        edge_count = 0
        for cap_range, companies in cap_ranges.items():
            if len(companies) > 1:
                # Connect each company to a few others in the same cap range
                import random
                for company in companies:
                    peers = [c for c in companies if c != company]
                    selected_peers = random.sample(peers, min(5, len(peers)))
                    
                    for peer in selected_peers:
                        if not self.graph.has_edge(company, peer):
                            self.graph.add_edge(company, peer, relationship='similar_market_cap')
                            edge_count += 1
        
        logger.info(f"Added {edge_count} market cap similarity edges")

        
    def _extract_financial_metrics(self, facts):
        """Extract key financial metrics from SEC facts"""
        
        financial_attrs = {}
        
        # Enhanced GAAP metrics
        gaap_metrics = {
            'Assets': 'total_assets',
            'Revenues': 'revenues',
            'NetIncomeLoss': 'net_income',
            'StockholdersEquity': 'stockholders_equity',
            'CashAndCashEquivalentsAtCarryingValue': 'cash',
            'LongTermDebt': 'long_term_debt'
        }
        
        us_gaap = facts.get('us-gaap', {})
        
        for gaap_key, attr_name in gaap_metrics.items():
            if gaap_key in us_gaap:
                metric_data = us_gaap[gaap_key]
                if 'units' in metric_data and 'USD' in metric_data['units']:
                    usd_data = metric_data['units']['USD']
                    if usd_data:
                        # Get most recent annual value
                        latest_value = max(usd_data, key=lambda x: x.get('end', ''))
                        financial_attrs[attr_name] = latest_value.get('val', 0)
        
        return financial_attrs
    
    def _process_market_attributes(self, market_data):
        """Process and normalize market data attributes"""
        
        market_attrs = {}
        
        # Direct market attributes
        market_fields = [
            'current_price', 'market_cap', 'pe_ratio', 'pb_ratio',
            'dividend_yield', 'beta', 'profit_margins',
            'return_on_equity', 'debt_to_equity', 'revenue_growth',
            'year_return'
        ]
        
        for field in market_fields:
            value = market_data.get(field, 0)
            # Handle None values and convert to float
            if value is not None and value != '':
                try:
                    market_attrs[f"mkt_{field}"] = float(value)
                except (ValueError, TypeError):
                    market_attrs[f"mkt_{field}"] = 0.0
            else:
                market_attrs[f"mkt_{field}"] = 0.0
        
        return market_attrs
    
    def _add_market_relationships(self, company):
        """Add market data relationships and nodes"""
        
        cik = str(company['cik'])
        market_data = company.get('market_data', {})
        tickers = company.get('tickers', [])
        
        if not tickers or not market_data:
            return
            
        ticker = tickers[0]
        
        # Add price node
        current_price = market_data.get('current_price', 0)
        if current_price and current_price > 0:
            price_node = f"price_{ticker}"
            if not self.graph.has_node(price_node):
                self.graph.add_node(price_node, 
                                  type='market_metric',
                                  metric='current_price',
                                  value=current_price,
                                  ticker=ticker)
            self.graph.add_edge(cik, price_node, relationship='has_price')
            self.relationship_types.add('has_price')
    
    def _add_financial_relationships(self, company):
        """Add financial relationships based on metrics"""
        
        cik = str(company['cik'])
        facts = company.get('facts', {})
        
        # Add metric nodes and relationships
        us_gaap = facts.get('us-gaap', {})
        for metric_name, metric_data in list(us_gaap.items())[:5]:  # Limit to avoid too many nodes
            if 'units' in metric_data:
                metric_node = f"metric_{metric_name}"
                
                if not self.graph.has_node(metric_node):
                    self.graph.add_node(metric_node, type='metric', name=metric_name)
                
                self.graph.add_edge(cik, metric_node, relationship='reports')
                self.relationship_types.add('reports')
                
    def _add_industry_relationships(self, company):
        """Add industry-based relationships"""
        
        cik = str(company['cik'])
        sic = company.get('sic', '')
        sic_description = company.get('sicDescription', '')
        
        if sic and sic_description:
            industry_node = f"industry_{sic}"
            
            if not self.graph.has_node(industry_node):
                self.graph.add_node(industry_node, 
                                  type='industry', 
                                  sic=sic, 
                                  description=sic_description)
            
            self.graph.add_edge(cik, industry_node, relationship='belongs_to')
            self.relationship_types.add('belongs_to')
    
    def get_enhanced_node_features(self, node_id):
        """
        Extracts enhanced numerical features for a node in a consistent, predefined order.
        """
        # 1. Handle nodes that might not be in the graph (e.g., during edge cases)
        if node_id not in self.graph.nodes:
            return np.zeros(len(self.feature_names), dtype=np.float32)
    
        attrs = self.graph.nodes[node_id]
        features_dict = {}  # Build features in a dictionary for robustness
    
        # 2. Type encoding (one-hot)
        node_type = attrs.get('type', 'unknown')
        features_dict['type_company'] = 1.0 if node_type == 'company' else 0.0
        features_dict['type_metric'] = 1.0 if node_type == 'metric' else 0.0
        features_dict['type_industry'] = 1.0 if node_type == 'industry' else 0.0
        features_dict['type_market_metric'] = 1.0 if node_type == 'market_metric' else 0.0
    
        # 3. SEC financial metrics (log-normalized to handle large value ranges)
        sec_metrics = ['total_assets', 'revenues', 'net_income', 'stockholders_equity']
        for metric in sec_metrics:
            value = float(attrs.get(metric, 0.0))
            features_dict[metric] = np.log1p(max(0, value))
    
        # 4. Market data features (with specific normalization)
        market_metrics = [
            'mkt_current_price', 'mkt_market_cap', 'mkt_pe_ratio', 'mkt_pb_ratio',
            'mkt_dividend_yield', 'mkt_beta', 'mkt_return_on_equity', 'mkt_debt_to_equity'
        ]
        for metric in market_metrics:
            value = float(attrs.get(metric, 0.0))
            if 'ratio' in metric or 'yield' in metric:
                normalized_value = max(-1.0, min(1.0, value))
            elif 'price' in metric or 'cap' in metric:
                normalized_value = np.log1p(max(0, value))
            else: # Covers beta and other metrics
                normalized_value = max(-3.0, min(3.0, value))
            features_dict[metric] = normalized_value
            
        # 5. SIC code encoding (normalized)
        # Handle cases where 'sic' can be None or an integer. The original code
        # failed with an AttributeError if attrs.get('sic') returned None.
        sic_raw = attrs.get('sic') 
        # Safely convert the raw value to a string. None becomes an empty string ''.
        sic = str(sic_raw) if sic_raw is not None else ''
        
        # Now that `sic` is guaranteed to be a string, this check is safe.
        sic_numeric = float(sic) if sic.isdigit() else 0.0
        features_dict['sic_numeric'] = sic_numeric / 10000.0
    
        # 6. Categorical counts
        features_dict['num_exchanges'] = float(len(attrs.get('exchanges', [])))
        features_dict['num_tickers'] = float(len(attrs.get('tickers', [])))
    
        # 7. Assemble the final feature vector in the correct order
        # This is the most important step for robustness. It uses the predefined
        # list to order the features and provides a default of 0.0 if a key is missing.
        final_features = [features_dict.get(name, 0.0) for name in self.feature_names]
        
        return np.array(final_features, dtype=np.float32)

# 
# HYBRID LLM-GNN INTEGRATION
# 

# class HybridLLMGNNFinancialSystem(nn.Module):
#     """
#     Advanced hybrid system integrating LLM and GNN with bidirectional information flow
#     """
    
#     def __init__(self, config, llm_agent: FineTunedPhi4Agent, e5_embedder: MultilingualE5Embedder):
#         super().__init__()
            
#         self.config = config
        
#         # Initialize E5 embedder (as in your existing system)
#         self.e5_embedder = e5_embedder
        
#         # GNN component
#         self.gnn = AttentionBasedFinancialGNN(
#             input_dim=1024,  # E5 embedding dimension
#             hidden_dim=config.embedding_dim,  # Use your config setting
#             num_heads=8,
#             num_layers=3
#         )
        
#         # LLM component (your existing Phi-4)
#         self.llm_agent = llm_agent
        
#         # Cross-modal fusion layers
#         self.llm_to_gnn_adapter = nn.Sequential(
#             nn.Linear(1024, config.embedding_dim),
#             nn.ReLU(),
#             nn.LayerNorm(config.embedding_dim)
#         )
        
#         gnn_output_dim = config.embedding_dim // 8  # 1024 / 8 = 128
        
#         self.gnn_to_llm_adapter = nn.Sequential(
#             nn.Linear(gnn_output_dim, 1024),
#             nn.ReLU(),
#             nn.LayerNorm(1024)
#         )
        
#         # Bidirectional attention mechanism
#         self.cross_modal_attention = nn.MultiheadAttention(
#             embed_dim=1024,
#             num_heads=8,
#             batch_first=True
#         )
    
#     def unified_prediction(self, text_data, graph_data, task='risk_assessment'):
#         """
#         Unified prediction combining LLM and GNN insights
#         """
#         # Monitor memory before heavy operations
#         if torch.cuda.is_available():
#             allocated = torch.cuda.memory_allocated() / 1024**3  # GB
#             reserved = torch.cuda.memory_reserved() / 1024**3   # GB
#             logger.info(f"Memory before prediction - Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB")
        
#         # Stage 1: GNN analysis
#         x, edge_index = graph_data['x'], graph_data['edge_index']
#         gnn_output = self.gnn(x, edge_index)
#         graph_embeddings = gnn_output['node_embeddings']
        
#         # Stage 2: LLM analysis
#         text_embeddings = self.e5_embedder.encode_passages(text_data)
#         text_embeddings = torch.tensor(text_embeddings).to(graph_embeddings.device)
        
#         # Stage 3: Cross-modal fusion
#         adapted_graph_features = self.gnn_to_llm_adapter(graph_embeddings)
        
#         # Enhance text embeddings with graph structure
#         enhanced_text, attention_weights = self.cross_modal_attention(
#             text_embeddings.unsqueeze(0),
#             adapted_graph_features.unsqueeze(0),
#             adapted_graph_features.unsqueeze(0)
#         )
        
#         # Generate explanation
#         explanation = self._generate_interpretation(gnn_output, enhanced_text, task)
        
#         return {
#             'gnn_output': gnn_output,
#             'enhanced_text': enhanced_text.squeeze(0),
#             'attention_weights': attention_weights,
#             'explanation': explanation
#         }
    
#     def _generate_interpretation(self, gnn_output, enhanced_text, task):
#         """Generate human-readable interpretation"""
        
#         interpretation_prompt = f"""
#         Based on the hybrid LLM-GNN analysis for {task}, provide an explanation:
        
#         Key factors influencing this {task} decision based on graph neural network 
#         attention patterns and enhanced language understanding.
        
#         Explain the reasoning in simple terms.
#         """
        
#         explanation = self.llm_agent.generate(interpretation_prompt, max_new_tokens=150)
        
#         return {
#             'explanation': explanation,
#             'confidence_factors': {
#                 'gnn_attention_strength': float(torch.mean(torch.stack([
#                     # GATv2Conv returns a tuple (edge_index, scores). We only want the scores [1].
#                     w[1].mean() for w in gnn_output.get('attention_weights', []) 
#                     if w is not None and isinstance(w, tuple) and len(w) > 1
#                 ]) if gnn_output.get('attention_weights') else torch.tensor([0.5]))),
#                 'risk_assessment': float(gnn_output['risk_scores'].mean()) if 'risk_scores' in gnn_output else 0.5
#             }
#         }
#     # Clear cache after prediction
#     if torch.cuda.is_available():
#             torch.cuda.empty_cache()

# 
# QDRANT STORAGE SYSTEM
# 

class QdrantEmbeddingStorage:
    """
    MODIFIED: A pure inference client for Qdrant that loads from and searches in
    multiple, model-specific collections.
    """
    
    def __init__(self, e5_embedder: "MultilingualE5Embedder", storage_path="./financial_embeddings_db"):
        self.storage_path = storage_path
        self.embedder = e5_embedder
        try:
            self.client = QdrantClient(path=self.storage_path)
            logger.info(f"Connected to Qdrant local mode at {self.storage_path}")
        except Exception as e:
            logger.error(f"Failed to initialize Qdrant client: {e}", exc_info=True)
            self.client = QdrantClient(":memory:")
            logger.warning("Falling back to in-memory Qdrant client.")

    def load_collection(self, collection_name: str) -> Tuple[Dict[str, np.ndarray], Dict[str, Dict]]:
        """Loads all vectors and payloads from a single specified collection."""
        embeddings_dict = {}
        node_attributes = {}
        
        try:
            self.client.get_collection(collection_name=collection_name)
            scroll_result, _ = self.client.scroll(
                collection_name=collection_name, limit=20000,
                with_payload=True, with_vectors=True
            )
            
            for point in scroll_result:
                node_id = point.payload.get("node_id", str(point.id))
                if node_id:
                    embeddings_dict[node_id] = np.array(point.vector)
                    node_attributes[node_id] = point.payload
                    
            logger.info(f"Loaded {len(embeddings_dict)} points from collection '{collection_name}'")
        except Exception as e:
            raise ValueError(f"Collection '{collection_name}' not found or is empty.") from e

        return embeddings_dict, node_attributes
        
    def search_similar_entities(self, model_type: str, query_text: Optional[str] = None, 
                                query_embedding: Optional[np.ndarray] = None, 
                                top_k: int = 10, filters: Optional[dict] = None) -> List[dict]:
        """
        MODIFIED: Searches in a model-specific collection and DEFAULTS to filtering for companies.
        """
        collection_name = f"financial_entities_{model_type}"
    
        if query_embedding is None and query_text is not None:
            query_vector = self.embedder.encode_queries([query_text])[0]
        elif query_embedding is not None:
            query_vector = query_embedding
        else:
            raise ValueError("Either query_text or query_embedding must be provided.")
    
        try:
            # Default to a company-only filter if none is provided.
            if filters is None:
                filters = {"type": "company"}
    
            qdrant_filter = self._convert_filters(filters)

            # Dynamically determine the collection to search
            collection_name = f"financial_entities_{model_type}"

            # Use the modern .search() method with correct parameters
            search_results = self.client.search(
                collection_name = collection_name,
                query_vector=query_vector.tolist(),
                query_filter=qdrant_filter,
                limit=top_k,
                with_payload=True
            )
            
            # Process results
            similar_entities = []
            for result in search_results: # Iterating directly over the result list
                payload = result.payload
                similar_entities.append({
                    "entity_id": str(result.id),
                    "name": payload.get("name", str(result.id)),
                    "type": payload.get("type", "unknown"),
                    "ticker": payload.get("ticker", ""),
                    "similarity_score": float(result.score),
                    "market_cap": payload.get("market_cap", 0),
                    "pe_ratio": payload.get("pe_ratio", 0),
                    "sector": payload.get("sector", ""),
                    "metadata": payload
                })
            
            return similar_entities
            
        except Exception as e:
            logger.error(f"Error searching similar entities in Qdrant: {e}")
            return []



    def filter_by_payload(self, model_type: str, filters: dict, limit: int = 1) -> List[dict]:
        """Retrieves points based on an exact payload filter, using scroll."""
        collection_name = f"financial_entities_{model_type}"
        try:
            qdrant_filter = self._convert_filters(filters)
    
            # Use scroll for exact filtering instead of search
            scroll_result, _ = self.client.scroll(
                collection_name=collection_name,
                scroll_filter=qdrant_filter,
                limit=limit,
                with_payload=True
            )
    
            similar_entities = []
            for result in scroll_result:
                payload = result.payload or {}
                similar_entities.append({
                    "id": result.id,
                    "name": payload.get("name", "Unknown Entity"),
                    "type": payload.get("type", "unknown"),
                    "ticker": payload.get("ticker", ""),
                    "similarity_score": 1.0,  # It's an exact match
                    "metadata": payload
                })
            return similar_entities
        except Exception as e:
            logger.error(f"Scroll-based filtering failed in collection '{collection_name}': {e}")
            return []

    
    def _convert_filters(self, filters: Optional[dict]) -> Optional[Filter]:
        """ filter conversion that handles $and, $or operators"""
        if not filters:
            return None
        
        conditions = []
        
        # Handle boolean operators
        if "$and" in filters:
            and_conditions = []
            for condition in filters["$and"]:
                and_conditions.extend(self._parse_filter_condition(condition))
            return Filter(must=and_conditions)
        
        elif "$or" in filters:
            or_conditions = []
            for condition in filters["$or"]:
                or_conditions.extend(self._parse_filter_condition(condition))
            return Filter(should=or_conditions)
        
        else:
            # Simple filters
            conditions = self._parse_filter_condition(filters)
            return Filter(must=conditions) if conditions else None
    
    def _parse_filter_condition(self, condition: dict) -> List:
        """Parse individual filter conditions"""
        conditions = []
        
        for key, value in condition.items():
            if isinstance(value, dict):
                # Range conditions like {"$gt": 10}
                range_conditions = {}
                for op, val in value.items():
                    clean_op = op.replace('$', '')
                    range_conditions[clean_op] = val
                conditions.append(FieldCondition(key=key, range=Range(**range_conditions)))
            else:
                # Exact match
                conditions.append(FieldCondition(key=key, match=MatchValue(value=value)))
        
        return conditions



# 
# EXPLAINABLE AI FOR FINANCIAL DECISIONS
# 

# class FinancialExplainerAgent:
#     """
#     Explainable AI agent for financial GNN decisions
#     """
    
#     def __init__(self, config, llm_agent: FineTunedPhi4Agent):
#         self.config = config
#         self.llm_agent = llm_agent
    
#     def explain_prediction(self, model_output, query, entity_data):
#         """
#         Generate comprehensive explanation for financial prediction
#         """
        
#         # Extract key information from model output
#         attention_weights = model_output.get('attention_weights', [])
#         risk_scores = model_output.get('risk_scores', torch.tensor([0.5]))
#         node_embeddings = model_output.get('node_embeddings', torch.tensor([]))
        
#         # Calculate importance scores
#         importance_analysis = self._analyze_importance(attention_weights, entity_data)
        
#         # Generate narrative explanation
#         narrative = self._generate_narrative_explanation(
#             query, importance_analysis, risk_scores.mean().item()
#         )
        
#         # Create evidence trail
#         evidence_trail = self._create_evidence_trail(importance_analysis)
        
#         return {
#             'prediction_explanation': narrative,
#             'evidence_trail': evidence_trail,
#             'confidence_score': self._calculate_confidence(importance_analysis),
#             'key_factors': importance_analysis['top_factors']
#         }
    
#     def _analyze_importance(self, attention_weights, entity_data):
#         """Analyze attention weights to determine important factors"""
        
#         if not attention_weights or not entity_data:
#             return {'top_factors': [], 'attention_distribution': []}
        
#         # Simple importance analysis
#         top_factors = []
        
#         # Add entity-based factors
#         for entity in entity_data[:3]:  # Top 3 entities
#             factor = {
#                 'factor_type': 'entity_similarity',
#                 'entity_name': entity.get('name', 'Unknown'),
#                 'importance_score': entity.get('similarity_score', 0.5),
#                 'description': f"Similar to {entity.get('name', 'Unknown')} in {entity.get('sector', 'unknown sector')}"
#             }
#             top_factors.append(factor)
        
#         return {
#             'top_factors': top_factors,
#             'attention_distribution': [0.8, 0.6, 0.4]  # Placeholder
#         }
    
#     def _generate_narrative_explanation(self, query, importance_analysis, risk_score):
#         """Generate human-readable narrative explanation"""
        
#         factors_text = "\n".join([
#             f"- {factor['description']} (importance: {factor['importance_score']:.2f})"
#             for factor in importance_analysis['top_factors']
#         ])
        
#         explanation_prompt = f"""
#         Explain this financial analysis result in simple terms:
        
#         Query: {query}
#         Risk Score: {risk_score:.3f}
        
#         Key Contributing Factors:
#         {factors_text}
        
#         Provide a clear, professional explanation of why this assessment was made.
#         """
        
#         explanation = self.llm_agent.generate(
#             explanation_prompt,
#             max_new_tokens=200,
#             temperature=0.2
#         )
        
#         return explanation.strip()
    
#     def _create_evidence_trail(self, importance_analysis):
#         """Create detailed evidence trail for audit purposes"""
        
#         evidence_items = []
        
#         for factor in importance_analysis['top_factors']:
#             evidence_items.append({
#                 'evidence_type': factor['factor_type'],
#                 'description': factor['description'],
#                 'confidence': factor['importance_score'],
#                 'impact_level': 'high' if factor['importance_score'] > 0.7 else 'medium' if factor['importance_score'] > 0.4 else 'low'
#             })
        
#         return evidence_items
    
#     def _calculate_confidence(self, importance_analysis):
#         """Calculate overall confidence in the explanation"""
        
#         if not importance_analysis['top_factors']:
#             return 0.5
        
#         avg_importance = sum(
#             factor['importance_score'] 
#             for factor in importance_analysis['top_factors']
#         ) / len(importance_analysis['top_factors'])
        
#         return min(avg_importance, 1.0)

# 
# NUMERICAL REASONING AGENT
# 
class EnhancedNumericalReasoning:
    """
    An agent that uses a two-stage process to solve numerical queries,
    separating reasoning from calculation to improve accuracy.
    """
    def __init__(self, phi4_agent: FineTunedPhi4Agent):
        self.phi4_agent = phi4_agent

    def numerical_query_with_steps(self, query: str) -> dict:
        """Two-stage numerical reasoning with intermediate steps."""
        
        # Stage 1: Generate a plan with reasoning and calculation steps.
        reasoning_prompt = f"""
        Break down the following financial query into reasoning steps and a list of precise mathematical operations.
        Query: {query}

        Provide your response in a pure JSON format with no extra text:
        {{
            "steps": ["step 1 describing the logic", "step 2 describing the next part of the logic"],
            "calculations": ["1280 / 1366", "(result) * 100"],
            "reasoning": "A brief explanation of the financial formula being used."
        }}
        """
        stage1_response = self.phi4_agent.generate(reasoning_prompt, max_new_tokens=200, temperature=0.2)
        intermediate_steps = extract_json_from_response(stage1_response)

        if not intermediate_steps or 'calculations' not in intermediate_steps:
            return {"error": "Failed to generate calculation steps in Stage 1."}

        # Stage 2: Execute the calculations and provide the final answer.
        final_prompt = f"""
        Given the following list of calculations: {intermediate_steps.get('calculations', [])}
        
        Execute each calculation precisely and provide the final numerical answer in a pure JSON format:
        {{
            "final_calculations": ["1280 / 1366 = 0.937", "0.937 * 100 = 93.7"],
            "answer": "93.7%"
        }}
        """
        final_response = self.phi4_agent.generate(final_prompt, max_new_tokens=150, temperature=0.2)
        
        return {
            'intermediate_steps': intermediate_steps,
            'final_result': extract_json_from_response(final_response)
        }
# 
# UNIFIED KNOWLEDGE GRAPH AGENT (COMBINING BOTH APPROACHES)
# 

class UnifiedKnowledgeGraphAgent:
    """Inference-only agent that loads pre-trained embeddings from multiple model collections"""
    
    def __init__(self, config: SystemConfig, e5_embedder: MultilingualE5Embedder):
        self.config = config
        self.storage_path = config.graph_storage_path
        self.embedding_model = e5_embedder
        
        # Initialize storage client
        self.storage = QdrantEmbeddingStorage(
            e5_embedder=self.embedding_model,
            storage_path=self.storage_path
        )
        
        # Load all available embeddings on initialization
        self.embeddings_by_model, self.node_attributes = self._load_all_embeddings()
        
        # Performance cache
        self.similarity_cache = LRUCache(maxsize=1000)
        
        logger.info(f" Loaded {sum(len(v) for v in self.embeddings_by_model.values())} embeddings "
                   f"from {len(self.embeddings_by_model)} model types")
     
    def resolve_entity(self, entity_query: str) -> List[dict]:
        """Resolves an entity using a multi-step waterfall approach before falling back."""
        
        #  Waterfall Step 1: Exact, Case-Sensitive Match 
        # The fastest and most accurate check.
        logger.info(f"Attempting exact match for '{entity_query}'...")
        exact_match_filter = {"name": entity_query}
        results = self.storage.filter_by_payload('e5', filters=exact_match_filter)
        if results:
            return results
    
        #  Waterfall Step 2: Case-Insensitive Match 
        # Handles variations like "apple" vs "Apple".
        logger.info(f"Attempting case-insensitive match for '{entity_query.title()}'...")
        caseless_match_filter = {"name": entity_query.title()}
        results = self.storage.filter_by_payload('e5', filters=caseless_match_filter)
        if results:
            return results
    
        
        #  Final Step: Semantic Search Fallback 
        #logger.info(f"No exact match found. Falling back to semantic search for '{entity_query}'...")
        return self.find_similar_entities(entity_query, model_type='e5', top_k=1)
        
    def _load_all_embeddings(self):
        """Load embeddings from all trained model collections"""
        embeddings_by_model = {}
        all_node_attributes = {}
        
        # Expected model types from your training pipeline
        model_types = ['graphsage', 'attention_gnn', 'temporal_gnn', 'e5']
        
        for model_type in model_types:
            try:
                # Search in the model-specific collection
                collection_name = f"financial_entities_{model_type}"
                embeddings, attributes = self._load_collection(collection_name)
                
                if embeddings:
                    embeddings_by_model[model_type] = embeddings
                    all_node_attributes.update(attributes)
                    logger.info(f" Loaded {len(embeddings)} {model_type} embeddings")
                    
            except Exception as e:
                logger.warning(f"⚠️ Could not load {model_type} embeddings: {e}")
        
        return embeddings_by_model, all_node_attributes
    
    def find_relationship_path(self, source_entity: str, target_entity: str) -> Optional[dict]:
        """Safe stub for relationship path finding"""
        try:
            # Find both entities
            source_results = self.find_similar_entities(source_entity, top_k=1)
            target_results = self.find_similar_entities(target_entity, top_k=1)
            
            if source_results and target_results:
                # Simple similarity-based relationship
                source_sector = source_results[0].get('metadata', {}).get('sector', '')
                target_sector = target_results.get('metadata', {}).get('sector', '')
                
                if source_sector == target_sector and source_sector:
                    return {
                        "type": "direct",
                        "path": [source_entity, target_entity],
                        "confidence": 0.8,
                        "relationships": ["same_sector"]
                    }
            
            return None
            
        except Exception as e:
            logger.warning(f"Relationship path finding failed: {e}")
            return None

    
    def _load_collection(self, collection_name):
        """
        Load ALL embeddings from a Qdrant collection by handling pagination.
        """
        try:
            embeddings_dict = {}
            node_attributes = {}
            next_page_offset = None  # Start with no offset

            # Loop until there are no more pages of results
            while True:
                # Fetch a batch of points
                scroll_result, next_page_offset = self.storage.client.scroll(
                    collection_name=collection_name,
                    limit=1000,  # A smaller batch size is more memory-efficient
                    offset=next_page_offset,
                    with_payload=True,
                    with_vectors=True
                )

                # Process the points from the current batch
                for point in scroll_result:
                    node_id = point.payload.get("node_id", str(point.id))
                    embeddings_dict[node_id] = np.array(point.vector)
                    node_attributes[node_id] = point.payload
                
                # If there's no next_page_offset, we've reached the end
                if not next_page_offset:
                    break
            
            total_loaded = len(embeddings_dict)
            if total_loaded > 0:
                logger.info(f" Loaded a total of {total_loaded} points from collection '{collection_name}'")
            
            return embeddings_dict, node_attributes
            
        except Exception as e:
            logger.error(f"Failed to load collection {collection_name}: {e}")
            return {}, {}
    
    def find_similar_entities(self, entity_name: str, model_type: str = 'e5', 
                             top_k: int = 10, filters: dict = None):
        # Now returns entities enriched with SEC EDGAR financial data
        return self.storage.search_similar_entities(
            model_type=model_type,
            query_text=entity_name,
            top_k=top_k,
            filters=filters
        )
    
    def get_available_model_types(self):
        """Return list of available embedding model types"""
        return list(self.embeddings_by_model.keys())
    
    def get_peer_companies(self, entity_name: str, model_type: str = 'graphsage', 
                          top_k: int = 5):
        """Get peer companies using graph-based embeddings"""
        
        # Find the entity first
        entity_results = self.find_similar_entities(entity_name, model_type, top_k=1)
        if not entity_results:
            return []
        
        # Get sector from the entity
        entity_data = entity_results[0].get('metadata', {})
        sector = entity_data.get('sector', '')
        
        if sector:
            # Find companies in same sector
            sector_filter = {"type": "company", "sector": sector}
            peers = self.find_similar_entities(
                entity_name=sector,
                model_type=model_type,
                top_k=top_k + 1,
                filters=sector_filter
            )
            # Remove original entity
            return [p for p in peers if p.get('name') != entity_results[0].get('name')]
        
        return []

# 
# ENHANCED RETRIEVAL SYSTEM
# 

class EnhancedRetrieverAgent:
    """Enhanced retriever with hybrid text + graph embedding search"""
    
    def __init__(self, config: SystemConfig, kg_agent: UnifiedKnowledgeGraphAgent, e5_embedder: MultilingualE5Embedder):
        self.config = config
        self.kg_agent = kg_agent
        self.embedding_model = e5_embedder
        self.vector_index = faiss.IndexFlatL2(config.vector_db_dimension)
        self.document_metadata = []
        
        # Weight factors for hybrid search
        self.text_weight = 0.6
        self.graph_weight = 0.4
    
    def hybrid_search(self, query: str, top_k: int = 10) -> List[Dict[str, Any]]:
        """Hybrid search combining document and graph embeddings"""
        
        # Text-based document search (placeholder - would integrate with actual document store)
        text_results = self._placeholder_document_search(query, top_k=top_k)
        
        # Graph-based entity search
        graph_results = self._graph_enhanced_search(query, top_k=top_k)
        
        # Combine and re-rank results
        combined_results = self._combine_search_results(text_results, graph_results)
        
        return combined_results[:top_k]
    
    def _placeholder_document_search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Placeholder for document search - would integrate with actual document store"""
        return [
            {
                "title": f"Document about {query}",
                "content": f"This is a placeholder document discussing {query} and related financial topics.",
                "source": "Financial Database",
                "relevance_score": 0.8,
                "search_type": "text_search"
            }
        ]
    
    def _graph_enhanced_search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Search using graph embeddings"""
        
        # Extract potential entities from query
        entities_in_query = self._extract_query_entities(query)
        
        graph_results = []
        for entity in entities_in_query:
            similar_entities = self.kg_agent.find_similar_entities(
                entity,
                top_k=top_k,
                filters={"type": "company"}  # Focus on companies for financial queries
            )
            
            for similar_entity in similar_entities:
                # Create a pseudo-document from graph entity
                content = self._create_entity_document(similar_entity)
                graph_results.append({
                    "title": f"Entity: {similar_entity['name']}",
                    "content": content,
                    "source": "Knowledge Graph",
                    "relevance_score": similar_entity["similarity_score"],
                    "entity_data": similar_entity,
                    "search_type": "graph_embedding"
                })
        
        return graph_results
    
    def _create_entity_document(self, entity_data: dict) -> str:
        """Create a document-like representation of a graph entity using SEC EDGAR data"""
        
        # Access financial data from the nested 'metadata' dictionary
        metadata = entity_data.get('metadata', {})
        
        content_parts = [
            f"Company: {entity_data.get('name', 'N/A')}",
            f"Ticker: {entity_data.get('ticker')}" if entity_data.get('ticker') else "",
            f"Sector: {metadata.get('sector')}" if metadata.get('sector') else "",
            f"Total Assets: ${metadata.get('total_assets', 0):,.0f}" if metadata.get('total_assets', 0) > 0 else "",
            f"Revenue: ${metadata.get('revenues', 0):,.0f}" if metadata.get('revenues', 0) > 0 else "",
            f"Net Income: ${metadata.get('net_income', 0):,.0f}" if metadata.get('net_income', 0) > 0 else "",
            f"Market Cap: ${metadata.get('market_cap', 0):,.0f}" if metadata.get('market_cap', 0) > 0 else "",
            f"P/E Ratio: {metadata.get('pe_ratio', 0):.2f}" if metadata.get('pe_ratio', 0) > 0 else "",
            f"ROE: {metadata.get('return_on_equity', 0):.2f}%" if metadata.get('return_on_equity', 0) > 0 else "",
        ]
        
        return " | ".join([part for part in content_parts if part])

    
    def _extract_query_entities(self, query: str) -> List[str]:
        """Extract potential entity names from query"""
        entities = []
        
        # Company name patterns
        company_patterns = [
            r'\b([A-Z][a-z]+ ?(?:Inc|Corp|LLC|Ltd|Company)\.?)\b',
            r'\b(Apple|Microsoft|Tesla|Amazon|Google|Meta|Netflix|Nvidia)\b'
        ]
        
        # Ticker patterns
        ticker_pattern = r'\b([A-Z]{2,5})\b'
        
        for pattern in company_patterns:
            entities.extend(re.findall(pattern, query, re.IGNORECASE))
        
        entities.extend(re.findall(ticker_pattern, query))
        
        return list(set(entities))
    
    def _combine_search_results(self, text_results: List[Dict], graph_results: List[Dict]) -> List[Dict]:
        """Combine and re-rank text and graph search results"""
        
        combined = []
        
        # Add text results with weighted scores
        for result in text_results:
            result["combined_score"] = result["relevance_score"] * self.text_weight
            combined.append(result)
        
        # Add graph results with weighted scores
        for result in graph_results:
            result["combined_score"] = result["relevance_score"] * self.graph_weight
            combined.append(result)
        
        # Sort by combined score
        combined.sort(key=lambda x: x["combined_score"], reverse=True)
        
        return combined

# 
# ENHANCED FACT CHECKING SYSTEM
# 

class EnhancedFactCheckerAgent:
    """Enhanced fact-checker using graph embeddings for verification"""
    
    def __init__(self, config: SystemConfig, kg_agent: UnifiedKnowledgeGraphAgent):
        self.config = config
        self.kg_agent = kg_agent
        self.cache = TTLCache(maxsize=1000, ttl=config.cache_ttl)
    
    def extract_claims(self, text: str, phi4_model) -> List[str]:
        """Extract claims from text using the LLM"""
        return phi4_model.extract_claims(text)
    
    def verify_claim_with_embeddings(self, claim: str, entities: List[str]) -> FactCheckResult:
        """Verify claims using graph embedding similarity and relationships"""
        
        cache_key = hash(f"{claim}_{str(entities)}")
        if cache_key in self.cache:
            return self.cache[cache_key]
        
        supporting_evidence = []
        contradicting_evidence = []
        confidence = 0.0
        
        # Extract entities mentioned in the claim
        claim_entities = self._extract_claim_entities(claim, entities)
        
        if len(claim_entities) >= 2:
            # Check relationship plausibility using embeddings
            source_entity = claim_entities[0]
            target_entity = claim_entities[1]
            
            # Find relationship path
            relationship_path = self.kg_agent.find_relationship_path(source_entity, target_entity)
            
            if relationship_path:
                path_confidence = relationship_path["confidence"]
                
                if relationship_path["type"] == "direct":
                    supporting_evidence.append(
                        f"Direct relationship found: {' â†’ '.join(relationship_path['path'])}"
                    )
                    confidence += 0.7 * path_confidence
                else:
                    supporting_evidence.append(
                        f"Inferred relationship via embeddings: {' â†’ '.join(relationship_path['relationships'])}"
                    )
                    confidence += 0.4 * path_confidence
            
            # Cross-verify with peer companies using embeddings
            peer_verification = self._verify_with_peer_companies(claim, source_entity)
            confidence += peer_verification["confidence"]
            supporting_evidence.extend(peer_verification["evidence"])
        
        # Numerical verification using similar companies
        if self._contains_numbers(claim):
            numerical_verification = self._verify_numerical_with_peers(claim, claim_entities)
            confidence += numerical_verification["confidence"]
            supporting_evidence.extend(numerical_verification["evidence"])
        
        result = FactCheckResult(
            claim=claim,
            verified=confidence >= self.config.fact_check_threshold,
            confidence=min(confidence, 1.0),
            supporting_evidence=supporting_evidence,
            contradicting_evidence=contradicting_evidence,
            timestamp=datetime.now()
        )
        
        self.cache[cache_key] = result
        return result
    
    def _verify_with_peer_companies(self, claim: str, entity: str) -> Dict[str, Any]:
        """Verify claim against peer companies using graph embeddings"""
        
        # Find similar companies
        peer_companies = self.kg_agent.find_similar_entities(
            entity,
            top_k=5,
            filters={"type": "company"}
        )
        
        evidence = []
        confidence = 0.0
        
        for peer in peer_companies:
            if peer["similarity_score"] > 0.8:  # High similarity threshold
                evidence.append(f"Similar pattern found in peer company: {peer['name']} (similarity: {peer['similarity_score']:.2f})")
                confidence += 0.1
        
        return {"evidence": evidence, "confidence": min(confidence, 0.5)}
    
    def _verify_numerical_with_peers(self, claim: str, entities: List[str]) -> Dict[str, Any]:
        """Verify numerical claims against peer group averages"""
        
        evidence = []
        confidence = 0.0
        
        if not entities:
            return {"evidence": evidence, "confidence": confidence}
        
        primary_entity = entities[0]
        
        # Get peer companies
        peers = self.kg_agent.find_similar_entities(
            primary_entity,
            top_k=10,
            filters={"type": "company"}
        )
        
        # Extract numerical values from claim
        numbers = re.findall(r'\d+\.?\d*', claim)
        if not numbers:
            return {"evidence": evidence, "confidence": confidence}
        
        claimed_value = float(numbers[0])
        
        # Check if value is reasonable compared to peers
        if 'pe ratio' in claim.lower() or 'p/e' in claim.lower():
            peer_pe_ratios = [p["pe_ratio"] for p in peers if p["pe_ratio"] > 0]
            if peer_pe_ratios:
                avg_pe = np.mean(peer_pe_ratios)
                std_pe = np.std(peer_pe_ratios)
                
                if abs(claimed_value - avg_pe) <= 2 * std_pe:
                    evidence.append(f"P/E ratio {claimed_value} within reasonable range of peer average {avg_pe:.2f}")
                    confidence += 0.3
                else:
                    evidence.append(f"P/E ratio {claimed_value} outside peer range (avg: {avg_pe:.2f})")
        
        return {"evidence": evidence, "confidence": confidence}
    
    def _extract_claim_entities(self, claim: str, entities: List[str]) -> List[str]:
        """Extract entities specifically mentioned in the claim"""
        claim_entities = []
        for entity in entities:
            if entity.lower() in claim.lower():
                claim_entities.append(entity)
        return claim_entities
    
    def _contains_numbers(self, text: str) -> bool:
        """Check if text contains numerical values"""
        return bool(re.search(r'\d+\.?\d*', text))


class FinancialGraphRAGQueryEngine:
    """A query engine that uses GraphRAG principles for financial analysis."""

    def __init__(self, kg_agent: UnifiedKnowledgeGraphAgent, phi4_agent: FineTunedPhi4Agent, communities: dict, sec_api_tool: SECAPITool):
        self.kg_agent = kg_agent
        self.phi4_agent = phi4_agent
        self.communities = communities
        self.e5_embedder = self.kg_agent.embedding_model # Access the embedder
        self.sec_api_tool = sec_api_tool

        # Create a searchable index for the community summaries
        self._initialize_community_index()

    def _initialize_community_index(self):
        """Creates a FAISS index for the natural language summaries of each community."""
        if not self.communities:
            logger.warning("No communities available. Creating empty community index.")
            # Create a minimal FAISS index with correct dimensions but no vectors
            self.community_index = faiss.IndexFlatL2(1024)  # E5 embedding dimension
            self.community_id_map = []
            return
        
        logger.info(f"Initializing FAISS index for {len(self.communities)} community summaries.")
        summaries = [data['summary'] for data in self.communities.values()]
        
        # Check if summaries is empty before encoding
        if not summaries:
            logger.warning("No community summaries available. Creating empty community index.")
            self.community_index = faiss.IndexFlatL2(1024)
            self.community_id_map = []
            return
        
        # Use the existing E5 embedder to create vectors for the summaries
        summary_vectors = self.e5_embedder.encode_passages(summaries).astype('float32')
        
        self.community_index = faiss.IndexFlatL2(summary_vectors.shape[1])
        self.community_index.add(summary_vectors)
        
        # Map the index position back to the original community ID
        self.community_id_map = list(self.communities.keys())
        logger.info("Community index created successfully.")


    def _get_relevant_communities(self, question: str, top_k: int = 3) -> dict:
        """Finds the most relevant communities for a global question."""
        question_vector = self.e5_embedder.encode_queries([question])
        _, indices = self.community_index.search(question_vector, top_k)
        
        relevant_communities = {}
        for idx in indices[0]:
            community_id = self.community_id_map[idx]
            relevant_communities[community_id] = self.communities[community_id]
        return relevant_communities

    def _get_entity_neighbors(self, entity_name: str, top_k: int = 5) -> List[dict]:
        """Finds related entities (peers) for a local query, simulating graph traversal."""
        return self.kg_agent.get_peer_companies(entity_name, top_k=top_k)

    
    def _build_entity_context(self, entities: List[dict]) -> str:
        """Creates a clean, human-readable text block from a list of entities."""
        context_lines = []
        for entity in entities: 
    
            # First, safely get the nested metadata dictionary
            metadata = entity.get('metadata', {})
            
            # Now, access all data from the correct locations
            name = entity.get('name', 'N/A')
            # The ticker may be at the top level or in metadata, so check both
            ticker = entity.get('ticker') or (metadata.get('tickers', ['N/A'])[0] if metadata.get('tickers') else 'N/A')
            sector = metadata.get('sector', 'N/A')
            # Use the correct keys for market cap and P/E ratio as defined in the graph
            mkt_cap = metadata.get('mkt_market_cap', 0) 
            pe_ratio = metadata.get('mkt_pe_ratio', 0)
            
            
            # Format market cap for readability
            if mkt_cap > 1e12:
                mkt_cap_str = f"${mkt_cap / 1e12:.2f}T"
            elif mkt_cap > 1e9:
                mkt_cap_str = f"${mkt_cap / 1e9:.2f}B"
            elif mkt_cap > 1e6:
                mkt_cap_str = f"${mkt_cap / 1e6:.2f}M"
            else:
                mkt_cap_str = f"${mkt_cap:,.2f}"
    
            line = f"- {name} ({ticker}): Sector={sector}, Market Cap={mkt_cap_str}, P/E Ratio={pe_ratio:.2f}"
            context_lines.append(line)
            
        return "\n".join(context_lines)

    def global_query(self, question: str, entities_from_llm: Optional[List[dict]] = None) -> str:
        """
        ENHANCED: Handles direct data questions with Yahoo Finance integration - NO LLM for final response
        """
        if not entities_from_llm:
            return None
    
        entity = entities_from_llm[0]
        metadata = entity.get('metadata', {})
        entity_name = entity.get('name', 'Unknown Company')
        cik = metadata.get('cik')
    
        available_facts = {}
        source_description = ""
        context_entity_name = entity_name
    
        # Step 1: Try comprehensive live data (SEC + Yahoo Finance)
        if cik:
            logger.info(f"Attempting live data fetch for {entity_name} (CIK: {cik})...")
            live_data = self.sec_api_tool.get_live_company_data(cik)
            
            if live_data:
                logger.info(f" Successfully fetched live data for {entity_name}")
                context_entity_name = live_data.get('entityName', entity_name)
                source_type = live_data.get('data_source', 'SEC')
                
                # Store raw data for direct access
                available_facts = {k: v for k, v in live_data.items() 
                                if isinstance(v, (int, float)) and k not in ['data_source']}
                source_description = f"Live data from {source_type}"
    
        # Step 2: If no CIK available, try direct Yahoo Finance lookup
        if not available_facts and entity_name != 'Unknown Company':
            
            yahoo_data = self.sec_api_tool.get_yahoo_finance_data(entity_name)
            
            if yahoo_data:
                logger.info(f" Successfully fetched Yahoo Finance data for {entity_name}")
                context_entity_name = yahoo_data.get('entityName', entity_name)
                
                available_facts = {k: v for k, v in yahoo_data.items() 
                                if isinstance(v, (int, float)) and k not in ['data_source']}
                source_description = "Live data from Yahoo Finance"
    
        # Step 3: Fall back to stored KG data if all else fails
        if not available_facts:
            logger.warning(f"Live data unavailable. Falling back to stored KG data for {entity_name}.")
            stored_facts = {
                "P/E Ratio": metadata.get('mkt_pe_ratio'), 
                "Market Cap": metadata.get('mkt_market_cap'),
                "Revenue": metadata.get('revenues'), 
                "Total Assets": metadata.get('total_assets'),
            }
            available_facts = {k: v for k, v in stored_facts.items() if isinstance(v, (int, float)) and v != 0}
            source_description = "Stored knowledge graph data"
    
        # Step 4: Direct response without LLM
        if not available_facts:
            return f"I could not retrieve financial data for {entity_name} from any available source."
    
        # Parse question and return specific data directly
        return self._extract_specific_data(question, available_facts, context_entity_name, source_description)
    
    def _extract_specific_data(self, question: str, facts: dict, entity_name: str, source: str) -> str:
        """Extract and return specific data based on the question without LLM processing"""
        
        question_lower = question.lower()
        
        # Define mapping of question patterns to data keys
        data_mappings = {
            # P/E Ratio variations
            ('pe ratio', 'p/e ratio', 'pe', 'price to earnings'): ['P/E Ratio', 'pe_ratio', 'mkt_pe_ratio'],
            
            # Market Cap variations  
            ('market cap', 'market capitalization', 'market value'): ['Market Cap', 'market_cap', 'mkt_market_cap'],
            
            # Price variations
            ('price', 'stock price', 'current price', 'share price'): ['Current Price', 'current_price', 'mkt_current_price'],
            
            # Revenue variations
            ('revenue', 'revenues', 'sales', 'total revenue'): ['Revenue', 'revenues', 'totalRevenue'],
            
            # Other ratios
            ('pb ratio', 'p/b ratio', 'price to book'): ['P/B Ratio', 'pb_ratio', 'mkt_pb_ratio'],
            ('dividend yield', 'dividend'): ['Dividend Yield', 'dividend_yield', 'dividendYield'],
            ('beta',): ['Beta', 'beta'],
            ('roe', 'return on equity'): ['ROE', 'roe', 'returnOnEquity'],
            
            # Financial metrics
            ('total assets', 'assets'): ['Total Assets', 'total_assets', 'totalAssets'],
            ('total debt', 'debt'): ['Total Debt', 'total_debt', 'totalDebt'],
            ('net income', 'profit', 'earnings'): ['Net Income', 'net_income', 'netIncome'],
        }
        
        # Find matching data
        for question_patterns, data_keys in data_mappings.items():
            if any(pattern in question_lower for pattern in question_patterns):
                # Look for the data in facts
                for key in data_keys:
                    if key in facts and facts[key] is not None:
                        value = facts[key]
                        
                        # Format the response based on data type
                        formatted_value = self._format_financial_value(key, value)
                        
                        return f"{entity_name}'s {question_patterns[0]} is {formatted_value}. (Source: {source})"
        
        # If no specific match, list available data
        available_metrics = []
        for key, value in list(facts.items())[:10]:  # Limit to first 10 items
            formatted_value = self._format_financial_value(key, value)
            available_metrics.append(f"{key}: {formatted_value}")
        
        return f"Available data for {entity_name}:\n" + "\n".join(available_metrics) + f"\n\n(Source: {source})"
    
    def _format_financial_value(self, key: str, value: float) -> str:
        """Format financial values appropriately"""
        
        # Handle ratios and percentages (values between 0 and 1 that aren't prices)
        if isinstance(value, float) and 0 < abs(value) < 1:
            if any(term in key.lower() for term in ['ratio', 'margin', 'yield', 'roe', 'roa']):
                if 'yield' in key.lower() or 'margin' in key.lower():
                    return f"{value:.2%}"  # Display as percentage
                else:
                    return f"{value:.2f}"  # Display as decimal
            else:
                return f"{value:.4f}"
        
        # Handle large monetary values
        elif isinstance(value, (int, float)) and abs(value) >= 1000000000:
            return f"${value/1e9:.2f}B"
        elif isinstance(value, (int, float)) and abs(value) >= 1000000:
            return f"${value/1e6:.2f}M"
        elif isinstance(value, (int, float)) and abs(value) >= 1000:
            return f"${value:,.2f}"
        
        # Handle regular numbers
        elif isinstance(value, (int, float)):
            return f"{value:,.2f}"
        
        return str(value)


    def local_query(self, question: str, entities_from_llm: Optional[List[dict]] = None, gnn_insights: Optional[dict] = None) -> str:
        entities = entities_from_llm or []
        if not entities:
            return "Please specify a company for analysis."
    
        primary_entity = entities[0]
        entity_name = primary_entity.get('name', 'Unknown Company')
        
        # Check for competitor/similarity queries
        similarity_keywords = ['similar', 'like', 'competitors', 'compete', 'peers', 'vs']
        if any(word in question.lower() for word in similarity_keywords):
            
            # PRIORITY 1: Use GNN insights if available
            if gnn_insights and gnn_insights.get('similar_companies'):
                similar_companies = gnn_insights['similar_companies']
                response_lines = [f"Based on graph neural network analysis, here are the top competitors for {entity_name}:\n"]
                for i, comp in enumerate(similar_companies[:10], 1):
                    sector = comp.get('sector', 'N/A')
                    score = comp.get('similarity_score', 0)
                    response_lines.append(f"{i}. {comp['name']} (Sector: {sector}, Similarity: {score:.3f})")
                return "\n".join(response_lines)
            
            # FALLBACK: Use E5 semantic search with enhanced query
            # ... your existing E5 logic here
            else:
                # 1. Get all available descriptive data from the entity's metadata
                entity_name = primary_entity.get('name', 'Unknown Company')
                metadata = primary_entity.get('metadata', {})
                sector = metadata.get('sector', '')
                sic_description = metadata.get('sic_description', '')
    
                # 2. Build a list of meaningful keywords. This is more robust than a fixed sentence.
                query_keywords = [entity_name]
                if sector and sector.lower() != 'other':
                    query_keywords.append(sector)
                if sic_description and sic_description.lower() not in ['company', '']:
                    # Add the description only if it's specific and not a generic fallback
                    query_keywords.append(sic_description)
    
                # Join keywords to form the final query string
                enriched_query = " ".join(query_keywords)
                logger.info(f" Semantic competitor query detected. Using keyword-based query for E5 search: '{enriched_query}'")
    
                # 3. Perform the semantic search with the new, stronger query
                similar_companies = self.kg_agent.find_similar_entities(
                    enriched_query,
                    model_type='e5',
                    top_k=11,
                    filters={"type": "company"}
                )
                
                if not similar_companies:
                    return f"Could not find any competitors for {entity_name} using the enriched query."
    
                # 4. Filter and format the response (same as before)
                response_lines = [f"Based on semantic analysis, here are the top competitors for {entity_name}:\n"]
                seen_names = {entity_name.strip().lower()}
                count = 0
                
                for company in similar_companies:
                    comp_name = company.get('name', '').strip()
                    if comp_name and comp_name.lower() not in seen_names:
                        response_lines.append(f"{count + 1}. {comp_name}")
                        seen_names.add(comp_name.lower())
                        count += 1
                    if count >= 10:
                        break
                
                if count == 0:
                    return f"No distinct competitors found for {entity_name} after filtering."
                    
                return "\n".join(response_lines)
        
        # Fallback for other non-competitor "local" queries.
        logger.info(f"Handling other local query. Using GNN insights for {primary_entity.get('name', 'Unknown')}.")
        if gnn_insights and gnn_insights.get('similar_companies'):
            similar_companies = gnn_insights['similar_companies']
            lines = [f"Structurally similar companies to {primary_entity.get('name', 'Unknown')} (from GNN):\n"]
            for i, comp in enumerate(similar_companies[:10], 1):
                lines.append(f"{i}. {comp['name']} (Score: {comp['similarity_score']:.2f})")
            return "\n".join(lines)
        else:
            return f"Unable to perform the requested local analysis for {primary_entity.get('name', 'Unknown')}."

    
    def local_query(self, question: str, entities_from_llm: Optional[List[dict]] = None, gnn_insights: Optional[dict] = None) -> str:
        """
        REVISED: This method builds a robust keyword-based query for semantic search,
        making it resilient to missing or poor-quality data points.
        """
        entities = entities_from_llm or []
        if not entities:
            return "Please specify a company for analysis."

        primary_entity = entities[0]
        
        # Build a keyword-based query 
        similarity_keywords = ['similar', 'like', 'competitors', 'compete', 'peers', 'vs']
        if any(word in question.lower() for word in similarity_keywords):
            
            # 1. Get all available descriptive data from the entity's metadata
            entity_name = primary_entity.get('name', 'Unknown Company')
            metadata = primary_entity.get('metadata', {})
            sector = metadata.get('sector', '')
            sic_description = metadata.get('sic_description', '')

            # 2. Build a list of meaningful keywords. This is more robust than a fixed sentence.
            query_keywords = [entity_name]
            if sector and sector.lower() != 'other':
                query_keywords.append(sector)
            if sic_description and sic_description.lower() not in ['company', '']:
                # Add the description only if it's specific and not a generic fallback
                query_keywords.append(sic_description)

            # Join keywords to form the final query string
            enriched_query = " ".join(query_keywords)
            logger.info(f" Semantic competitor query detected. Using keyword-based query for E5 search: '{enriched_query}'")

            # 3. Perform the semantic search with the new, stronger query
            similar_companies = self.kg_agent.find_similar_entities(
                enriched_query,
                model_type='e5',
                top_k=11,
                filters={"type": "company"}
            )
            
            if not similar_companies:
                return f"Could not find any competitors for {entity_name} using the enriched query."

            # 4. Filter and format the response (same as before)
            response_lines = [f"Based on semantic analysis, here are the top competitors for {entity_name}:\n"]
            seen_names = {entity_name.strip().lower()}
            count = 0
            
            for company in similar_companies:
                comp_name = company.get('name', '').strip()
                if comp_name and comp_name.lower() not in seen_names:
                    response_lines.append(f"{count + 1}. {comp_name}")
                    seen_names.add(comp_name.lower())
                    count += 1
                if count >= 10:
                    break
            
            if count == 0:
                return f"No distinct competitors found for {entity_name} after filtering."
                
            return "\n".join(response_lines)
       
    
# 
# GNN MODEL DEFINITION (SYNCHRONIZED WITH TRAINING SCRIPT)
# 
class AttentionBasedFinancialGNN(nn.Module):
    """ Architecture matching the training script exactly"""
    
    def __init__(self, input_dim, hidden_dim=128, num_heads=8, num_layers=3):
        super().__init__()
        self.attention_layers = nn.ModuleList()
        
        for i in range(num_layers):
            in_channels = input_dim if i == 0 else hidden_dim  
            concat = i < num_layers - 1
            
            self.attention_layers.append(
                GATv2Conv(
                    in_channels=in_channels,
                    out_channels=hidden_dim // num_heads,  # 128 // 8 = 16
                    heads=num_heads,
                    dropout=0.1,
                    concat=concat  # Only final layer doesn't concatenate
                )
            )
        
        # Final embedding dimension matches training script
        final_embedding_dim = hidden_dim // num_heads  # 16, not 128
        self.feature_predictor = nn.Linear(final_embedding_dim, 1)

    def forward(self, x, edge_index, return_attention=False):
        for i, layer in enumerate(self.attention_layers):
            x = layer(x, edge_index)
            if i < len(self.attention_layers) - 1:
                x = F.relu(x)
                x = F.dropout(x, p=0.1, training=self.training)
        
        return {
            'embeddings': x,
            'feature_prediction': self.feature_predictor(x).squeeze()
        }


# 
# LANGGRAPH NODES (UPDATED)
# 


def query_analysis_node(state: FinancialState, phi4_agent: FineTunedPhi4Agent) -> Dict[str, Any]:
    """
    FINAL VERSION: Analyzes the query, robustly extracts entities, and then resolves
    only those extracted entities against the knowledge graph.
    """
    query = state["messages"][-1].content
    kg_agent = state['kg_agent']
    logger.info(" Entering Node: query_analysis ")
    
    # Step 1: Use the LLM to extract potential entity names from the query.
    analysis = phi4_agent.analyze_query(query)
    
    resolved_entities = []
    if analysis.get("entities"):
        all_resolved_entities = []

        # Step 2: Loop through ONLY the names extracted by the LLM.
        for entity_name in analysis["entities"]:
            clean_name = re.sub(r"'s\b", "", entity_name).strip()
            if not clean_name:
                continue

            # This sanity check is still valuable.
            if len(clean_name.split()) > 6:
                logger.warning(f"Skipping resolution for implausible entity name: '{clean_name}'")
                continue

            # Step 3: Resolve the CLEANED NAME, not the whole query. 
            resolved = kg_agent.resolve_entity(clean_name)
            
            if resolved:
                logger.info(f" Resolved '{clean_name}' to '{resolved[0]['name']}'.")
                all_resolved_entities.extend(resolved)
        
        # De-duplicate and finalize the list of resolved entities
        if all_resolved_entities:
            unique_entities_dict = {
                (e.get('metadata', {}).get('cik') or e.get('id')): e for e in all_resolved_entities
            }
            resolved_entities = list(unique_entities_dict.values())
            analysis["entities"] = [e['name'] for e in resolved_entities]

    if not resolved_entities and analysis.get("query_type") == "local":
        logger.warning(f"A 'local' query was identified, but no entities could be resolved.")

    return {
        "analysis_results": analysis,
        "entities": resolved_entities
    }
    
def gnn_inference_node(state: FinancialState, gnn_model: nn.Module, kg: Any) -> Dict[str, Any]:
    """ GNN-based similarity search using neighborhood sampling."""
    if not gnn_model or not kg:
        return {"gnn_analysis": {"status": "GNN model or graph not loaded. Skipping."}}
        
    entities = state.get("entities", [])
    if not entities:
        return {"gnn_analysis": {"status": "No entities for GNN analysis."}}

    target_entity = entities[0]
    
    # 1: Better target node ID resolution
    target_node_id = None
    metadata = target_entity.get('metadata', {})
    
    # Try different ID fields in order of preference
    for id_field in ['node_id', 'cik', 'id']:
        if id_field in metadata and metadata[id_field]:
            target_node_id = str(metadata[id_field])
            break
    
    # If still no ID found, try top-level fields
    if not target_node_id:
        for id_field in ['cik', 'id']:
            if id_field in target_entity and target_entity[id_field]:
                target_node_id = str(target_entity[id_field])
                break
    
    if not target_node_id:
        return {"gnn_analysis": {"status": "Target entity node ID not found in any expected field."}}

    node_list = list(kg.graph.nodes())
    node_to_idx = {node: i for i, node in enumerate(node_list)}
    
    if target_node_id not in node_to_idx:
        logger.warning(f"Target node ID '{target_node_id}' not found in graph. Available IDs sample: {list(node_to_idx.keys())[:5]}")
        return {"gnn_analysis": {"status": f"Target entity {target_node_id} not found in graph."}}
    
    # Create the PyG Data object
    features = [kg.get_enhanced_node_features(n) for n in node_list]
    x = torch.tensor(np.array(features), dtype=torch.float)
    edges = torch.tensor([[node_to_idx[u], node_to_idx[v]] for u, v in kg.graph.edges()], dtype=torch.long).t().contiguous()
    data = Data(x=x, edge_index=edges, num_nodes=len(node_list))
    
    target_idx = node_to_idx[target_node_id]
    
    neighbor_loader = NeighborLoader(
        data,
        input_nodes=torch.tensor([target_idx]),
        num_neighbors=[15, 10],
        batch_size=1,
        replace=False,
        shuffle=False
    )
    
    logger.info(f" Running memory-efficient GNN similarity search for: {target_entity['name']} (ID: {target_node_id})")
    
    sampled_data = next(iter(neighbor_loader))
    subgraph_x = sampled_data.x.to(device)
    subgraph_edge_index = sampled_data.edge_index.to(device)
    
    logger.info(f" Subgraph size: {subgraph_x.shape[0]} nodes (vs full graph: {len(node_list)})")
    
    with torch.no_grad():
        output = gnn_model(subgraph_x, subgraph_edge_index)
        embeddings = output['embeddings']
    
    # 2: Correctly identify the target entity in the subgraph
    # The target entity should be at index 0 in the subgraph node mapping
    target_subgraph_idx = 0  # NeighborLoader puts the input node first
    target_embedding = embeddings[target_subgraph_idx].unsqueeze(0)
    
    # Calculate similarities with all nodes in the subgraph
    similarities = F.cosine_similarity(target_embedding, embeddings, dim=1)
    
    # 3: Properly exclude the target entity and filter for companies only
    similar_companies = []
    
    for subgraph_idx in range(len(similarities)):
        if subgraph_idx == target_subgraph_idx:
            continue  # Skip the target entity itself
            
        similarity_score = similarities[subgraph_idx].cpu().item()
        
        # Map back to original node
        original_node_idx = sampled_data.n_id[subgraph_idx].item()
        node_id = node_list[original_node_idx]
        node_attrs = kg.graph.nodes[node_id]
        
        # 4: Only include companies and add sector filtering
        if node_attrs.get('type') == 'company':
            target_sector = kg.graph.nodes[target_node_id].get('sector', '')
            node_sector = node_attrs.get('sector', '')
            
            # Boost similarity for same-sector companies
            if target_sector and node_sector == target_sector:
                similarity_score *= 1.2  # Boost same-sector similarity
            
            similar_companies.append({
                'name': node_attrs.get('name', 'Unknown'),
                'node_id': node_id,
                'similarity_score': float(similarity_score),
                'sector': node_sector,
                'ticker': node_attrs.get('tickers', [''])[0] if node_attrs.get('tickers') else '',
            })
    
    # 5: Sort by similarity and take top results
    similar_companies.sort(key=lambda x: x['similarity_score'], reverse=True)
    similar_companies = similar_companies[:10]  # Take top 10
    
    feature_predictions = output['feature_prediction'].cpu()
    if len(feature_predictions.shape) == 0:  # Single scalar
        pe_prediction = feature_predictions.item()
    else:  # Multiple predictions - take the target node (index 0)
        pe_prediction = feature_predictions[0].item()
    
    analysis_result = {
        'target_entity': target_entity['name'],
        'target_node_id': target_node_id,
        'similar_companies': similar_companies,
        'predicted_pe_ratio': float(pe_prediction),
        'method': 'gnn_neighborhood_sampling_fixed'
    }
    
    logger.info(f" Found {len(similar_companies)} similar companies for {target_entity['name']}")
    if similar_companies:
        # 1. First, create the list of formatted strings
        top_matches_list = [f"{c['name']} ({c['similarity_score']:.3f})" for c in similar_companies[:3]]
        # 2. Then, use the created list in the log message
        # logger.info(f"Top 3 matches: {top_matches_list}")
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    return {"gnn_analysis": analysis_result}




def numerical_reasoning_node(state: FinancialState, numerical_agent: EnhancedNumericalReasoning) -> Dict[str, Any]:
    """Handles numerical queries using the two-stage reasoning agent."""
    logger.info(" Entering Node: numerical_reasoning ")
    query = state["messages"][-1].content
    
    # Perform the two-stage reasoning
    result = numerical_agent.numerical_query_with_steps(query)
    
    # Format the final response for the user
    if "error" in result:
        response_text = result["error"]
    else:
        final_answer = result.get('final_result', {}).get('answer', 'Could not compute the final answer.')
        reasoning_steps = result.get('intermediate_steps', {}).get('reasoning', "No reasoning provided.")
        calculations = result.get('final_result', {}).get('final_calculations', [])
        
        response_text = (
            f"Final Answer: {final_answer}\n\n"
            f" Reasoning \n{reasoning_steps}\n\n"
            f" Calculation Steps \n" + "\n".join(calculations)
        )
            
    return {"messages": [AIMessage(content=response_text)]}



def graphrag_analysis_node(state: FinancialState, graphrag_engine: Any) -> Dict[str, Any]:
    """Generate response using GraphRAG with LLM-extracted entities."""
    try:
        query = state["messages"][-1].content
        entities = state.get("entities", [])  #  LLM-extracted entities
        gnn_analysis = state.get("gnn_analysis", {})
        query_type = state.get("analysis_results", {}).get("query_type", "global")
        
        if query_type == "global":
            # Pass LLM-extracted entities to global_query
            response = graphrag_engine.global_query(
                question=query,
                entities_from_llm=entities
            )
        else:
            #  Pass LLM-extracted entities to local_query
            response = graphrag_engine.local_query(
                question=query,
                entities_from_llm=entities,  # Use LLM entities
                gnn_insights=gnn_analysis
            )

        return {"messages": [AIMessage(content=response)]}
        
    except Exception as e:
        logger.error(f"Critical error in graphrag_analysis_node: {e}", exc_info=True)
        return {"messages": [AIMessage(content="I'm sorry, but I encountered a technical issue while processing your request. Please try again.")]}

# 
# LANGGRAPH-ENHANCED FINANCIAL SYSTEM (REFACTORED)
# 

class LangGraphFinancialSystem:
    """Inference-focused system using pre-trained embeddings and GNN models."""
    
    def __init__(self, config: SystemConfig):
        self.config = config
        logger.info(" Initializing inference-ready Financial AI System...")

        self._initialize_ai_agents()
        self._initialize_graph_components()
        
        self.workflow = self._build_workflow()
        self.query_cache = LRUCache(maxsize=100)
        
        logger.info(" System initialized successfully.")
        self.get_system_status()

    def _initialize_ai_agents(self):
        """Initializes the core LLM and embedding models."""
        self.phi4_agent = FineTunedPhi4Agent(self.config)
        self.e5_embedder = MultilingualE5Embedder(self.config.embedding_model)
        self.numerical_agent = EnhancedNumericalReasoning(self.phi4_agent)
        self.sec_api_tool = SECAPITool(user_agent=self.config.sec_user_agent)

    def _initialize_graph_components(self):
        """Loads all graph-related assets: embeddings, GNN model, and RAG engine."""
        self.kg_agent = UnifiedKnowledgeGraphAgent(self.config, self.e5_embedder)
        
        self.gnn_model, self.kg_object = self._load_trained_gnn()
        
        self.graphrag_engine = self._build_graphrag_engine()
        
    def _load_trained_gnn(self) -> Tuple[Optional[nn.Module], Optional[Any]]:
        """Finds the latest training output and loads the GNN model and KG object."""
        try:
            base_dir = "./"
            training_dirs = sorted(
                [d for d in os.listdir(base_dir) if d.startswith("trained_models_") and os.path.isdir(d)],
                reverse=True
            )
            if not training_dirs:
                raise FileNotFoundError("No 'trained_models' directory found.")

            latest_dir = training_dirs[0]
            model_path = os.path.join(latest_dir, "attention_gnn.pth")
            kg_path = os.path.join(latest_dir, "knowledge_graph.pkl")

            if not os.path.exists(model_path) or not os.path.exists(kg_path):
                raise FileNotFoundError(f"Model or KG not found in the latest directory: {latest_dir}")

            with open(kg_path, 'rb') as f:
                kg = pickle.load(f)

            input_dim = len(kg.feature_names)
            
            gnn_model = AttentionBasedFinancialGNN(input_dim=input_dim)
            gnn_model.load_state_dict(torch.load(model_path, map_location=device))
            gnn_model.to(device)
            gnn_model.eval()
            
            logger.info(f" Loaded trained GNN and KG from '{latest_dir}'")
            return gnn_model, kg
        except Exception as e:
            logger.error(f"Failed to load trained GNN: {e}", exc_info=True)
            return None, None

    def _build_graphrag_engine(self):
        """Builds the GraphRAG engine using the loaded knowledge graph."""
        if not self.kg_object:
            logger.warning("Knowledge Graph object not loaded. Cannot build GraphRAG engine.")
            return None
        
        communities = self._build_communities(self.kg_object.graph)
        if not communities:
             logger.warning("Could not form communities. GraphRAG global search will be limited.")
             # Still return an engine, it can handle local queries without communities
             
        return FinancialGraphRAGQueryEngine(
            kg_agent=self.kg_agent,
            phi4_agent=self.phi4_agent,
            communities=communities or {},
            sec_api_tool=self.sec_api_tool
        )
    
    def _build_communities(self, graph: nx.Graph) -> dict:
        """Builds and summarizes communities using the Louvain algorithm."""
        if graph.number_of_edges() == 0: 
            logger.warning("Graph has no edges. Cannot build communities.")
            return {}
        
        try:
            # Use Louvain algorithm which doesn't require flash attention
            communities_list = louvain_communities(graph.to_undirected(), seed=42)
            communities = {}
            
            for i, community_nodes in enumerate(communities_list):
                if len(community_nodes) > 2:  # Only summarize meaningful communities
                    entity_names = [graph.nodes[node].get('name', '') for node in community_nodes]
                    entity_names = [name for name in entity_names if name]
                    
                    if entity_names:  # Only create summary if we have valid entity names
                        prompt = f"Summarize this group of companies in a single sentence under 20 words: {', '.join(entity_names[:10])}"
                        
                        # FIX: Add try-catch around LLM generation to prevent community failures
                        try:
                            summary = self.phi4_agent.generate(prompt, max_new_tokens=50, temperature=0.1)
                            if summary and summary.strip():  # Only add if summary is valid
                                communities[i] = {'summary': summary, 'entities': list(community_nodes)}
                        except Exception as e:
                            logger.warning(f"Failed to generate summary for community {i}: {e}")
                            # Create a simple fallback summary
                            communities[i] = {'summary': f"Community of {len(entity_names)} companies", 'entities': list(community_nodes)}
            
            logger.info(f"Successfully created {len(communities)} communities")
            return communities
            
        except Exception as e:
            logger.error(f"Community detection failed: {e}")
            return {}


    def _build_workflow(self):
        """Constructs the agentic workflow with a new branch for numerical reasoning."""
        workflow = StateGraph(FinancialState)

        # 1. Define nodes for each step
        analysis_node = partial(query_analysis_node, phi4_agent=self.phi4_agent)
        gnn_node = partial(gnn_inference_node, gnn_model=self.gnn_model, kg=self.kg_object)
        rag_node = partial(graphrag_analysis_node, graphrag_engine=self.graphrag_engine)
        # ADD a new node for numerical reasoning
        numerical_node = partial(numerical_reasoning_node, numerical_agent=self.numerical_agent)

        # 2. Add all nodes to the graph
        workflow.add_node("query_analysis", analysis_node)
        workflow.add_node("gnn_inference", gnn_node)
        workflow.add_node("generate_response", rag_node)
        workflow.add_node("numerical_reasoning", numerical_node) # ADD the new node

        # 3. Define the new 3-way routing logic
        def route_query(state: FinancialState) -> Literal["local_path", "global_path", "numerical_path"]:
            query_type = state.get("analysis_results", {}).get("query_type")
            if query_type == "numerical":
                logger.info("Decision: Routing to numerical path.")
                return "numerical_path"
            elif query_type == "local" and state.get("entities"):
                logger.info("Decision: Routing to local path (GNN).")
                return "local_path"
            else:
                logger.info("Decision: Routing to global path (RAG).")
                return "global_path"
        
        # 4. Construct the graph with the new conditional routing
        workflow.set_entry_point("query_analysis")
        workflow.add_conditional_edges(
            "query_analysis",
            route_query,
            {
                "local_path": "gnn_inference",
                "global_path": "generate_response",
                "numerical_path": "numerical_reasoning" # ADD the new path
            }
        )
        
        # 5. Define the end points for all branches
        workflow.add_edge("gnn_inference", "generate_response")
        workflow.add_edge("generate_response", END)
        workflow.add_edge("numerical_reasoning", END) # ADD the end for the new path
        
        return workflow.compile()


    def stream_query(self, query: str):
        """
        MODIFIED: Streams the workflow execution, yielding node-keyed updates.
        """
        if not self.workflow:
            yield {"error": "Workflow not compiled."}
            return
            
        initial_state = {
            "messages": [HumanMessage(content=query)],
            "kg_agent": self.kg_agent
        }
        
        # FIX: Use stream_mode="updates" to get events like {"node_name": ...}
        for event in self.workflow.stream(initial_state, stream_mode="updates"):
            yield event

    def get_system_status(self):
        status = {
            "available_embedding_models": self.kg_agent.get_available_model_types(),
            "total_entities_loaded": len(self.kg_agent.node_attributes),
            "gnn_model_loaded": self.gnn_model is not None,
            "graphrag_engine_ready": self.graphrag_engine is not None,
        }
        logger.info(f" System Status: {status}")
        return status

# 
# UTILITY FUNCTIONS
# 

def search_companies_by_criteria(system: LangGraphFinancialSystem, criteria: List[str]):
    """Search companies by various criteria using the unified system"""
    
    results = {}
    
    # Search by market cap
    if 'large_cap' in criteria:
        results['large_cap'] = system.kg_agent.storage.search_similar_entities(
            query_text="large company",
            filters={"$and": [{"market_cap": {"$gt": 10e9}}, {"type": "company"}]},
            top_k=10
        )
    
    # Search by sector
    if 'tech' in criteria:
        results['tech'] = system.kg_agent.storage.search_similar_entities(
            query_text="technology software computer",
            filters={"type": "company"},
            top_k=10
        )
    
    # Search by financial metrics
    if 'high_pe' in criteria:
        results['high_pe'] = system.kg_agent.storage.search_similar_entities(
            query_text="growth stock",
            filters={"$and": [{"pe_ratio": {"$gt": 25}}, {"type": "company"}]},
            top_k=10
        )
    
    return results


# 
# MAIN EXECUTION (INTERACTIVE) - FINAL CORRECTED VERSION
# 

if __name__ == "__main__":
    print(" Initializing Financial AI System...")
    try:
        config = SystemConfig()
        system = LangGraphFinancialSystem(config)

        if not system.workflow:
            raise RuntimeError("System initialization failed. Workflow could not be compiled.")

        print("\n System is ready. Enter your financial query or type 'exit' to quit.")
        
        while True:
            query = input("\n> ")
            if query.lower() in ['exit', 'quit']:
                print("Exiting system. Goodbye!")
                break
            
            if not query.strip():
                continue
            
            print("Processing your query...")
            
            
            # Keep the latest state on every iteration to handle cases
            # where the stream ends without an explicit "__end__" event.
        
            acc_state = {
                "messages": [HumanMessage(content=query)],
                "kg_agent": system.kg_agent,
            }
            final_state = None

            for event in system.stream_query(query):
                if "__end__" in event:
                    print("  -> Step completed: __end__")
                    # On the final event, the acc_state is the final_state
                    final_state = acc_state
                    break

                node, payload = next(iter(event.items()))
                print(f"  -> Step completed: {node}")
                acc_state.update(payload)

                # Continuously update final_state with the latest accumulated state
                final_state = acc_state
            
            # This final assignment is a safeguard in case the loop finishes
            # without ever assigning final_state inside the loop.
            final_state = final_state or acc_state

            print("\n" + "="*50 + " FINAL RESPONSE " + "="*50)
            # Safely access messages from the guaranteed final_state
            if final_state and final_state.get("messages"):
                print(final_state["messages"][-1].content)
            else:
                print("ERROR: The workflow failed to produce a final message list.")
            print("="*116)
            # =

    except Exception as e:
        logger.error(f"A critical error occurred: {e}", exc_info=True)

  from .autonotebook import tqdm as notebook_tqdm
E0000 00:00:1755709504.032655    2568 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755709504.060730    2568 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755709504.256357    2568 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755709504.256400    2568 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755709504.256403    2568 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755709504.256405    2568 computation_placer.cc:177] computa

 Initializing Financial AI System...


INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards: 100%|██████████| 6/6 [02:10<00:00, 21.79s/it]
INFO:__main__:Fine-tuned Phi-4 model loaded successfully
INFO:__main__:Loaded intfloat/multilingual-e5-large on cuda
INFO:__main__:Connected to Qdrant local mode at ./financial_embeddings_db
INFO:__main__: Loaded a total of 17552 points from collection 'financial_entities_graphsage'
INFO:__main__: Loaded 17552 graphsage embeddings
INFO:__main__: Loaded a total of 17552 points from collection 'financial_entities_attention_gnn'
INFO:__main__: Loaded 17552 attention_gnn embeddings
INFO:__main__: Loaded a total of 17552 points from collection 'financial_entities_temporal_gnn'
INFO:__main__: Loaded 17552 temporal_gnn embeddings
INFO:__main__: Loaded a total of 16333 points from collection 'financial_entit


 System is ready. Enter your financial query or type 'exit' to quit.



>   Show me Microsoft's stock price?


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'microsoft corporation'...
INFO:__main__:Attempting case-insensitive match for 'Microsoft Corporation'...
  search_results = self.client.search(
INFO:__main__: Resolved 'microsoft corporation' to 'MICROSOFT CORP'.
INFO:__main__:Decision: Routing to global path (RAG).
INFO:__main__: ticker MSFT found for MICROSOFT CORP


  -> Step completed: query_analysis


INFO:__main__: Successfully fetched Yahoo Finance data for MICROSOFT CORP


  -> Step completed: generate_response

Microsoft Corporation's price is 505.25. (Source: Live data from Yahoo Finance)



>  What is Delta's dividend yield?


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'delta airlines inc'...
INFO:__main__:Attempting case-insensitive match for 'Delta Airlines Inc'...
  search_results = self.client.search(
INFO:__main__: Resolved 'delta airlines inc' to 'DELTA AIR LINES, INC.'.
INFO:__main__:Decision: Routing to global path (RAG).
INFO:__main__: ticker DAL found for DELTA AIR LINES, INC.


  -> Step completed: query_analysis


INFO:__main__: Successfully fetched Yahoo Finance data for DELTA AIR LINES, INC.


  -> Step completed: generate_response

Delta Air Lines, Inc.'s dividend yield is 1.24. (Source: Live data from Yahoo Finance)



>   What is Amazon's revenue?


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'amazon.com inc'...
INFO:__main__:Attempting case-insensitive match for 'Amazon.Com Inc'...
  search_results = self.client.search(
INFO:__main__: Resolved 'amazon.com inc' to 'AMAZON COM INC'.
INFO:__main__:Decision: Routing to global path (RAG).
INFO:__main__: ticker AMZN.MX found for AMAZON COM INC


  -> Step completed: query_analysis


INFO:__main__: Successfully fetched Yahoo Finance data for AMAZON COM INC


  -> Step completed: generate_response

Amazon.com, Inc.'s revenue is $670.04B. (Source: Live data from Yahoo Finance)



>  Show me Netflix's beta coefficient


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'netflix inc'...
INFO:__main__:Attempting case-insensitive match for 'Netflix Inc'...
  search_results = self.client.search(
INFO:__main__: Resolved 'netflix inc' to 'NETFLIX INC'.
INFO:__main__:Decision: Routing to global path (RAG).
INFO:__main__: ticker NFLX found for NETFLIX INC


  -> Step completed: query_analysis


INFO:__main__: Successfully fetched Yahoo Finance data for NETFLIX INC


  -> Step completed: generate_response

Netflix, Inc.'s beta is 1.59. (Source: Live data from Yahoo Finance)



>  What is JPMorgan Chase's total debt?


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'jpmorgan chase & co'...
INFO:__main__:Attempting case-insensitive match for 'Jpmorgan Chase & Co'...
  search_results = self.client.search(
INFO:__main__: Resolved 'jpmorgan chase & co' to 'JPMORGAN CHASE & CO'.
INFO:__main__:Decision: Routing to global path (RAG).
INFO:__main__: ticker JPM found for JPMORGAN CHASE & CO


  -> Step completed: query_analysis


INFO:__main__: Successfully fetched Yahoo Finance data for JPMORGAN CHASE & CO


  -> Step completed: generate_response

JPMorgan Chase & Co.'s total debt is $1165.54B. (Source: Live data from Yahoo Finance)



>  What is Nvidia's return on equity?


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'nvidia corporation'...
INFO:__main__:Attempting case-insensitive match for 'Nvidia Corporation'...
  search_results = self.client.search(
INFO:__main__: Resolved 'nvidia corporation' to 'NVIDIA CORP'.
INFO:__main__:Decision: Routing to global path (RAG).
INFO:__main__: ticker NVDA found for NVIDIA CORP


  -> Step completed: query_analysis


INFO:__main__: Successfully fetched Yahoo Finance data for NVIDIA CORP


  -> Step completed: generate_response

NVIDIA Corporation's roe is 1.15. (Source: Live data from Yahoo Finance)



>  What is Tesla's current stock price?


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'What is Tesla current stock price'...
INFO:__main__:Attempting case-insensitive match for 'What Is Tesla Current Stock Price'...
  search_results = self.client.search(
INFO:__main__: Resolved 'What is Tesla current stock price' to 'Tesla, Inc.'.
INFO:__main__:Decision: Routing to global path (RAG).


  -> Step completed: query_analysis


INFO:__main__: ticker TSLA found for Tesla, Inc.
INFO:__main__: Successfully fetched Yahoo Finance data for Tesla, Inc.


  -> Step completed: generate_response

Tesla, Inc.'s price is 321.83. (Source: Live data from Yahoo Finance)



>  What is AMD's return on equity?


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'amd'...
INFO:__main__:Attempting case-insensitive match for 'Amd'...
  search_results = self.client.search(
INFO:__main__: Resolved 'amd' to 'ADVANCED MICRO DEVICES INC'.
INFO:__main__:Decision: Routing to global path (RAG).
INFO:__main__: ticker AMD found for ADVANCED MICRO DEVICES INC


  -> Step completed: query_analysis


INFO:__main__: Successfully fetched Yahoo Finance data for ADVANCED MICRO DEVICES INC


  -> Step completed: generate_response

Advanced Micro Devices, Inc.'s roe is 0.05. (Source: Live data from Yahoo Finance)



>  What is Delta's price to book ratio?


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'delta airlines inc'...
INFO:__main__:Attempting case-insensitive match for 'Delta Airlines Inc'...
  search_results = self.client.search(
INFO:__main__: Resolved 'delta airlines inc' to 'DELTA AIR LINES, INC.'.
INFO:__main__:Decision: Routing to global path (RAG).
INFO:__main__: ticker DAL found for DELTA AIR LINES, INC.
INFO:__main__: Successfully fetched Yahoo Finance data for DELTA AIR LINES, INC.


  -> Step completed: query_analysis
  -> Step completed: generate_response

Delta Air Lines, Inc.'s price is 59.25. (Source: Live data from Yahoo Finance)



>  What is the P/E ratio of Apple?


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'apple inc'...
INFO:__main__:Attempting case-insensitive match for 'Apple Inc'...
  search_results = self.client.search(
INFO:__main__: Resolved 'apple inc' to 'Apple Inc.'.
INFO:__main__:Decision: Routing to global path (RAG).
INFO:__main__: ticker AAPL found for Apple Inc.


  -> Step completed: query_analysis


INFO:__main__: Successfully fetched Yahoo Finance data for Apple Inc.


  -> Step completed: generate_response

Apple Inc.'s pe ratio is 34.41. (Source: Live data from Yahoo Finance)



>  Find companies similar to Apple


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'Find companies similar to Apple'...
INFO:__main__:Attempting case-insensitive match for 'Find Companies Similar To Apple'...
  search_results = self.client.search(
INFO:__main__: Resolved 'Find companies similar to Apple' to 'Apple Inc.'.
INFO:__main__:Decision: Routing to local path (GNN).


  -> Step completed: query_analysis


INFO:__main__: Running memory-efficient GNN similarity search for: Apple Inc. (ID: 320193)
INFO:__main__: Subgraph size: 161 nodes (vs full graph: 17552)
INFO:__main__: Found 10 similar companies for Apple Inc.
INFO:__main__: Semantic competitor query detected. Using keyword-based query for E5 search: 'Apple Inc. Electronic Computers'


  -> Step completed: gnn_inference
  -> Step completed: generate_response

Based on semantic analysis, here are the top competitors for Apple Inc.:

1. DELL INC
2. Silicon Graphics International Corp
3. EVANS & SUTHERLAND COMPUTER CORP
4. ELECTRONICS FOR IMAGING INC
5. ECHELON CORP
6. DigitalTown, Inc.
7. Sector 5, Inc.
8. EMC CORP
9. QLOGIC CORP
10. CRAY INC



>  Who are Tesla's main competitors?


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'tesla inc'...
INFO:__main__:Attempting case-insensitive match for 'Tesla Inc'...
  search_results = self.client.search(
INFO:__main__: Resolved 'tesla inc' to 'Tesla, Inc.'.
INFO:__main__:Decision: Routing to local path (GNN).


  -> Step completed: query_analysis


INFO:__main__: Running memory-efficient GNN similarity search for: Tesla, Inc. (ID: 1318605)
INFO:__main__: Subgraph size: 162 nodes (vs full graph: 17552)
INFO:__main__: Found 10 similar companies for Tesla, Inc.
INFO:__main__: Semantic competitor query detected. Using keyword-based query for E5 search: 'Tesla, Inc. Motor Vehicles & Passenger Car Bodies'


  -> Step completed: gnn_inference
  -> Step completed: generate_response

Based on semantic analysis, here are the top competitors for Tesla, Inc.:

1. Arrival
2. ELECTRAMECCANICA VEHICLES CORP.
3. Lightning eMotors, Inc.
4. IDEANOMICS, INC.
5. TATA MOTORS LTD/FI
6. T3M INC.
7. TOYOTA MOTOR CORP/
8. Leo Motors, Inc.
9. FCA US LLC
10. Motors Liquidation Co



>  Find peers of JPMorgan Chase


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'Find peers of JPMorgan Chase'...
INFO:__main__:Attempting case-insensitive match for 'Find Peers Of Jpmorgan Chase'...
  search_results = self.client.search(
INFO:__main__: Resolved 'Find peers of JPMorgan Chase' to 'JPMORGAN CHASE & CO'.
INFO:__main__:Decision: Routing to local path (GNN).


  -> Step completed: query_analysis


INFO:__main__: Running memory-efficient GNN similarity search for: JPMORGAN CHASE & CO (ID: 19617)
INFO:__main__: Subgraph size: 166 nodes (vs full graph: 17552)
INFO:__main__: Found 10 similar companies for JPMORGAN CHASE & CO
INFO:__main__: Semantic competitor query detected. Using keyword-based query for E5 search: 'JPMORGAN CHASE & CO National Commercial Banks'


  -> Step completed: gnn_inference
  -> Step completed: generate_response

Based on semantic analysis, here are the top competitors for JPMORGAN CHASE & CO:

1. National Commerce Corp
2. COMMERCIAL NATIONAL FINANCIAL CORP /PA
3. COMMUNITYCORP
4. JEFFERSONVILLE BANCORP
5. CITIGROUP INC
6. Professional Holding Corp.
7. CHEVIOT FINANCIAL CORP
8. CITY NATIONAL CORP
9. Chino Commercial Bancorp
10. PEOPLES BANCORP INC/MD



>  Show me companies like goldman sachs


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'Show me companies like goldman sachs'...
INFO:__main__:Attempting case-insensitive match for 'Show Me Companies Like Goldman Sachs'...
  search_results = self.client.search(
INFO:__main__: Resolved 'Show me companies like goldman sachs' to 'GOLDMAN SACHS GROUP INC'.
INFO:__main__:Decision: Routing to local path (GNN).


  -> Step completed: query_analysis


INFO:__main__: Running memory-efficient GNN similarity search for: GOLDMAN SACHS GROUP INC (ID: 886982)
INFO:__main__: Subgraph size: 165 nodes (vs full graph: 17552)
INFO:__main__: Found 10 similar companies for GOLDMAN SACHS GROUP INC
INFO:__main__: Semantic competitor query detected. Using keyword-based query for E5 search: 'GOLDMAN SACHS GROUP INC Security Brokers, Dealers & Flotation Companies'


  -> Step completed: gnn_inference
  -> Step completed: generate_response

Based on semantic analysis, here are the top competitors for GOLDMAN SACHS GROUP INC:

1. GILMAN CIOCIA, INC.
2. SCHWAB CHARLES CORP
3. MORGAN STANLEY
4. Jefferies Group LLC
5. SUMMIT FINANCIAL SERVICES GROUP INC
6. GLEACHER & COMPANY, INC.
7. Cohen & Co Inc.
8. GREENHILL & CO INC
9. GAMCO INVESTORS, INC. ET AL
10. PENSON WORLDWIDE INC



>  Find companies that compete with Walmart


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'Find companies that compete with Walmart'...
INFO:__main__:Attempting case-insensitive match for 'Find Companies That Compete With Walmart'...
  search_results = self.client.search(
INFO:__main__: Resolved 'Find companies that compete with Walmart' to 'Walmart Inc.'.
INFO:__main__:Decision: Routing to local path (GNN).


  -> Step completed: query_analysis


INFO:__main__: Running memory-efficient GNN similarity search for: Walmart Inc. (ID: 104169)
INFO:__main__: Subgraph size: 163 nodes (vs full graph: 17552)
INFO:__main__: Found 10 similar companies for Walmart Inc.
INFO:__main__: Semantic competitor query detected. Using keyword-based query for E5 search: 'Walmart Inc. Retail-Variety Stores'


  -> Step completed: gnn_inference
  -> Step completed: generate_response

Based on semantic analysis, here are the top competitors for Walmart Inc.:

1. ALCO STORES INC
2. TUESDAY MORNING CORP/DE
3. BIG LOTS INC
4. PRICESMART INC
5. TARGET CORP
6. FREDS INC
7. WALGREEN CO
8. STAPLES INC
9. FAMILY DOLLAR STORES INC
10. WINMARK CORP



>  What companies are like Coca-Cola?


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'What companies are like Coca-Cola'...
INFO:__main__:Attempting case-insensitive match for 'What Companies Are Like Coca-Cola'...
  search_results = self.client.search(
INFO:__main__: Resolved 'What companies are like Coca-Cola' to 'Coca-Cola Consolidated, Inc.'.
INFO:__main__:Decision: Routing to local path (GNN).


  -> Step completed: query_analysis


INFO:__main__: Running memory-efficient GNN similarity search for: Coca-Cola Consolidated, Inc. (ID: 317540)
INFO:__main__: Subgraph size: 164 nodes (vs full graph: 17552)
INFO:__main__: Found 10 similar companies for Coca-Cola Consolidated, Inc.
INFO:__main__: Semantic competitor query detected. Using keyword-based query for E5 search: 'Coca-Cola Consolidated, Inc. Bottled & Canned Soft Drinks & Carbonated Waters'


  -> Step completed: gnn_inference
  -> Step completed: generate_response

Based on semantic analysis, here are the top competitors for Coca-Cola Consolidated, Inc.:

1. COCA-COLA REFRESHMENTS USA, INC.
2. COCA COLA FEMSA SAB DE CV
3. COCA-COLA EUROPEAN PARTNERS US, LLC
4. COCA-COLA EUROPACIFIC PARTNERS plc
5. ANDINA BOTTLING CO INC
6. Primo Water Corp /CN/
7. Cell-nique Corp
8. Celsius Holdings, Inc.
9. NATIONAL BEVERAGE CORP
10. Blue Line Holdings, Inc.



>  What pharmaceutical companies are like Pfizer?


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'What pharmaceutical companies are like Pfizer'...
INFO:__main__:Attempting case-insensitive match for 'What Pharmaceutical Companies Are Like Pfizer'...
  search_results = self.client.search(
INFO:__main__: Resolved 'What pharmaceutical companies are like Pfizer' to 'PFIZER INC'.
INFO:__main__:Decision: Routing to local path (GNN).


  -> Step completed: query_analysis


INFO:__main__: Running memory-efficient GNN similarity search for: PFIZER INC (ID: 78003)
INFO:__main__: Subgraph size: 109 nodes (vs full graph: 17552)
INFO:__main__: Found 10 similar companies for PFIZER INC
INFO:__main__: Semantic competitor query detected. Using keyword-based query for E5 search: 'PFIZER INC Pharmaceutical Preparations'


  -> Step completed: gnn_inference
  -> Step completed: generate_response

Based on semantic analysis, here are the top competitors for PFIZER INC:

1. POZEN INC /NC
2. PHYSICIANS FORMULA HOLDINGS, INC.
3. PHARMACYCLICS INC
4. Pfenex Inc.
5. PSYENCE BIOMEDICAL LTD.
6. MEDICINES CO /DE
7. ADVANZ PHARMA Corp.
8. Pharmasset Inc
9. PHASERX, INC.
10. MEDICURE INC



>  What companies are similar to toyota?


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'What companies are similar to toyota'...
INFO:__main__:Attempting case-insensitive match for 'What Companies Are Similar To Toyota'...
  search_results = self.client.search(
INFO:__main__: Resolved 'What companies are similar to toyota' to 'TOYOTA MOTOR CORP/'.
INFO:__main__:Decision: Routing to local path (GNN).


  -> Step completed: query_analysis


INFO:__main__: Running memory-efficient GNN similarity search for: TOYOTA MOTOR CORP/ (ID: 1094517)
INFO:__main__: Subgraph size: 160 nodes (vs full graph: 17552)
INFO:__main__: Found 10 similar companies for TOYOTA MOTOR CORP/
INFO:__main__: Semantic competitor query detected. Using keyword-based query for E5 search: 'TOYOTA MOTOR CORP/ Motor Vehicles & Passenger Car Bodies'


  -> Step completed: gnn_inference
  -> Step completed: generate_response

Based on semantic analysis, here are the top competitors for TOYOTA MOTOR CORP/:

1. TATA MOTORS LTD/FI
2. Arrival
3. Motors Liquidation Co
4. T3M INC.
5. FORD MOTOR CO
6. TOYOTA MOTOR CREDIT CORP
7. General Motors Co
8. Proterra Inc
9. IDEANOMICS, INC.
10. HONDA MOTOR CO LTD



>  Show me banks similar to Wells Fargo


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'wells fargo & co'...
INFO:__main__:Attempting case-insensitive match for 'Wells Fargo & Co'...
  search_results = self.client.search(
INFO:__main__: Resolved 'wells fargo & co' to 'WELLS FARGO & COMPANY/MN'.
INFO:__main__:Decision: Routing to local path (GNN).


  -> Step completed: query_analysis


INFO:__main__: Running memory-efficient GNN similarity search for: WELLS FARGO & COMPANY/MN (ID: 72971)
INFO:__main__: Subgraph size: 164 nodes (vs full graph: 17552)
INFO:__main__: Found 10 similar companies for WELLS FARGO & COMPANY/MN
INFO:__main__: Semantic competitor query detected. Using keyword-based query for E5 search: 'WELLS FARGO & COMPANY/MN National Commercial Banks'


  -> Step completed: gnn_inference
  -> Step completed: generate_response

Based on semantic analysis, here are the top competitors for WELLS FARGO & COMPANY/MN:

1. MIDDLEBURG FINANCIAL CORP
2. NORWOOD FINANCIAL CORP
3. CITIZENS & NORTHERN CORP
4. MW Bancorp, Inc.
5. CULLEN/FROST BANKERS, INC.
6. MID WISCONSIN FINANCIAL SERVICES INC
7. NORTHERN STATES FINANCIAL CORP /DE/
8. FB Corp
9. MARSHALL & ILSLEY CORP
10. PEOPLES BANCORP INC/MD



>  Find semiconductor companies like nvidia


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'Find semiconductor companies like nvidia'...
INFO:__main__:Attempting case-insensitive match for 'Find Semiconductor Companies Like Nvidia'...
  search_results = self.client.search(
INFO:__main__: Resolved 'Find semiconductor companies like nvidia' to 'NVIDIA CORP'.
INFO:__main__:Decision: Routing to local path (GNN).


  -> Step completed: query_analysis


INFO:__main__: Running memory-efficient GNN similarity search for: NVIDIA CORP (ID: 1045810)
INFO:__main__: Subgraph size: 154 nodes (vs full graph: 17552)
INFO:__main__: Found 10 similar companies for NVIDIA CORP
INFO:__main__: Semantic competitor query detected. Using keyword-based query for E5 search: 'NVIDIA CORP Semiconductors & Related Devices'


  -> Step completed: gnn_inference
  -> Step completed: generate_response

Based on semantic analysis, here are the top competitors for NVIDIA CORP:

1. NEOPHOTONICS CORP
2. NATIONAL SEMICONDUCTOR CORP
3. SEMICONDUCTOR MANUFACTURING INTERNATIONAL CORP
4. LOGIC DEVICES Inc
5. ENERGY CONVERSION DEVICES INC
6. China Inc
7. E DIGITAL CORP
8. OPTI INC
9. LSI CORP
10. NXP Semiconductors N.V.



>  Who are main competitors for ford?


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'ford motor co'...
INFO:__main__:Attempting case-insensitive match for 'Ford Motor Co'...
  search_results = self.client.search(
INFO:__main__: Resolved 'ford motor co' to 'FORD MOTOR CREDIT CO LLC'.
INFO:__main__:Decision: Routing to local path (GNN).


  -> Step completed: query_analysis


INFO:__main__: Running memory-efficient GNN similarity search for: FORD MOTOR CREDIT CO LLC (ID: 38009)
INFO:__main__: Subgraph size: 131 nodes (vs full graph: 17552)
INFO:__main__: Found 10 similar companies for FORD MOTOR CREDIT CO LLC
INFO:__main__: Semantic competitor query detected. Using keyword-based query for E5 search: 'FORD MOTOR CREDIT CO LLC Miscellaneous Business Credit Institution'


  -> Step completed: gnn_inference
  -> Step completed: generate_response

Based on semantic analysis, here are the top competitors for FORD MOTOR CREDIT CO LLC:

1. Franklin Credit Management Corp
2. PHH CORP
3. TOYOTA MOTOR CREDIT CORP
4. MICROFINANCIAL INC
5. IBM CREDIT LLC
6. CATERPILLAR FINANCIAL SERVICES CORP
7. Consumer Capital Group, Inc.
8. AMERICAN HONDA FINANCE CORP
9. United Development Funding III, LP
10. CNH Industrial Capital LLC



>  Show me companies similar to Boeing


INFO:__main__: Entering Node: query_analysis 


Processing your query...


INFO:__main__:Attempting exact match for 'Show me companies similar to Boeing'...
INFO:__main__:Attempting case-insensitive match for 'Show Me Companies Similar To Boeing'...
  search_results = self.client.search(
INFO:__main__: Resolved 'Show me companies similar to Boeing' to 'BOEING CO'.
INFO:__main__:Decision: Routing to local path (GNN).


  -> Step completed: query_analysis


INFO:__main__: Running memory-efficient GNN similarity search for: BOEING CO (ID: 12927)
INFO:__main__: Subgraph size: 165 nodes (vs full graph: 17552)
INFO:__main__: Found 10 similar companies for BOEING CO
INFO:__main__: Semantic competitor query detected. Using keyword-based query for E5 search: 'BOEING CO Aircraft'


  -> Step completed: gnn_inference
  -> Step completed: generate_response

Based on semantic analysis, here are the top competitors for BOEING CO:

1. BOEING CAPITAL CORP
2. B/E AEROSPACE INC
3. Stealth Air Corp.
4. BOA Acquisition Corp.
5. HAWKER BEECHCRAFT ACQUISITION CO LLC
6. BRUNSWICK CORP
7. Bantec, Inc.
8. Copa Holdings, S.A.
9. Eviation Aircraft Ltd.
10. DUCOMMUN INC /DE/
