## **Stage-1**

Creating a knowledge base system which will act as brain to my model

In [8]:
import os 
import re
import pickle
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

## Configuration 
DEFAULT_MODEL_NAME="sentence-transformers/all-MiniLM-L6-v2"

def clean_text(text:str) -> str:
    """This function is used to clean the entire text [description]"""
    if text is None:
        return ""
    text = str(text)
    text = re.sub(r'\s+',' ',text).strip()
    return text

def safe_to_str(x):
    """This is a helper function"""
    if pd.isna(x):
        return ""
    return str(x).strip()

def build_system_kb_store_all_columns(
        excel_path: str,
        save_dir: str = "system_kb_store",
        model_name: str = DEFAULT_MODEL_NAME,
        batch_size: int = 64,
        embed_column: str = 'Description of Contract'
):
    """This function will create embedding on my description for my knowledge base"""
    os.makedirs(save_dir,exist_ok=True)
    print(f"\n Loading Excel Knowledge Base: {excel_path}")
    df = pd.read_excel(excel_path)
    print(f"Loaded rows={len(df)} col={len(df.columns)}")

    if embed_column not in df.columns:
        raise ValueError(f"Embed column '{embed_column}' not found in Excel!")
    
    df = df.fillna('')
    kb_texts=[]
    kb_meta=[]
    
    for idx, row in df.iterrows():
        desc=clean_text(row[embed_column])
        if not desc or len(desc) < 20:
            continue
        meta = {'row_id':int(idx)}
        for col in df.columns:
            meta[col]=safe_to_str(row[col])
        meta[embed_column]=desc
        kb_texts.append(desc)
        kb_meta.append(meta)
    print(f"KB rows kept after cleaning: {len(kb_texts)}")
    if len(kb_texts) == 0:
        print("ERROR:No text rows remained after cleaning. Check your 'clean_text' logic or input data.")
        return None, None
    print(f"Loading embedding model: {model_name}")
    embedder=SentenceTransformer(model_name)
    print("Creating Embeddings...")
    embeddings=[]
    for i in range(0,len(kb_texts),batch_size):
        batch=kb_texts[i:i + batch_size]
        batch_emb=embedder.encode(batch,show_progress_bar=True,normalize_embeddings=True)
        embeddings.append(batch_emb)
    embeddings=np.vstack(embeddings).astype('float32')
    dim=embeddings.shape[1]
    print(f"Embedding Shape:{embeddings.shape}")
    index=faiss.IndexFlatIP(dim)
    index.add(embeddings)
    index_path=os.path.join(save_dir,"system_kb.faiss")
    meta_path=os.path.join(save_dir,'system_kb_meta.pkl')
    faiss.write_index(index,index_path)
    with open(meta_path,'wb') as f:
        pickle.dump(kb_meta,f)
    print("System KB Created Successfully!")
    print(f"Index saved: {index_path}")
    print(f"Meta Saved: {meta_path}")
    return index_path, meta_path

if __name__=="__main__":
    EXCEL_PATH=r"C:\Users\mukeshkr\Agentic-AI-Defense-Data-Extraction\data\sample_data.xlsx"
    build_system_kb_store_all_columns(
        excel_path=EXCEL_PATH,
        save_dir='system_kb_store',
        model_name=DEFAULT_MODEL_NAME,
        batch_size=64,
        embed_column="Description of Contract"
    )




 Loading Excel Knowledge Base: C:\Users\mukeshkr\Agentic-AI-Defense-Data-Extraction\data\sample_data.xlsx
Loaded rows=2068 col=29
KB rows kept after cleaning: 600
Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: b94d340d-9491-4534-9001-a3ba8a140304)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


Creating Embeddings...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:02<00:00,  1.32s/it]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:02<00:00,  1.28s/it]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:02<00:00,  1.20s/it]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:02<00:00,  1.22s/it]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:02<00:00,  1.27s/it]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:02<00:00,  1.32s/it]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:02<00:00,  1.31s/it]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:02<00:00,  1.35s/it]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:02<00:00,  1.28s/it]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.01it/s]

Embedding Shape:(600, 384)
System KB Created Successfully!
Index saved: system_kb_store\system_kb.faiss
Meta Saved: system_kb_store\system_kb_meta.pkl





In [11]:
import os 
import pickle
import faiss
from sentence_transformers import SentenceTransformer

DEFAULT_MODEL_NAME='sentence-transformers/all-MiniLM-L6-v2'

class SystemKBRetriever:
    def __init__(self, kb_dir='system_kb_store', model_name=DEFAULT_MODEL_NAME):
        index_path = os.path.join(kb_dir, 'system_kb.faiss')
        meta_path = os.path.join(kb_dir, 'system_kb_meta.pkl')

        if not os.path.exists(index_path) or not os.path.exists(meta_path):
            raise FileNotFoundError("KB files are missing. Build KB first.")
        
        print(f"Loading FAISS index: {index_path}")
        self.index = faiss.read_index(index_path)
        
        print(f"Loading metadata: {meta_path}")
        with open(meta_path, 'rb') as f:
            self.meta = pickle.load(f)
            
        print(f"Loaded KB rows: {len(self.meta)}")
        print(f"Loading embedder: {model_name}")
        self.embedder = SentenceTransformer(model_name)
    
    def retrieve(self, query_text: str, top_k: int = 5):
        # FIX 1: Do not use .split(). We want to embed the whole sentence, not a list of words.
        query_text = str(query_text).strip()
        
        if not query_text:
            return []
        
        # Encode returns shape (1, 384) because we pass a list with 1 string
        q_emb = self.embedder.encode([query_text], normalize_embeddings=True).astype('float32')
        
        scores, idxs = self.index.search(q_emb, top_k)
        results = []
        
        # FIX 2: Corrected typo 'socre' -> 'score'
        for score, idx in zip(scores[0], idxs[0]):
            if idx < 0:
                continue
            results.append({
                'score': float(score),  # FIX 3: Use individual 'score', not the array 'scores'
                "meta": self.meta[idx]
            })
        return results
    
if __name__=="__main__":
    # Ensure the directory exists and contains files created by the builder script
    if os.path.exists('system_kb_store'):
        r = SystemKBRetriever(kb_dir='system_kb_store')
        q = 'Dell Marketing L.P., Round Rock, Texas, is awarded a single-award, firm-fixed-price blanket purchase agreement'
        
        hits = r.retrieve(q, top_k=3)

        for h in hits:
            print("\nScore:", h['score'])
            print("Supplier:", h['meta'].get('Supplier Name'))
            print("Market:", h['meta'].get("Market Segment"))
            print("System:", h['meta'].get('System Name (Specific)'))
    else:
        print("Error: 'system_kb_store' directory not found. Please run the builder script first.")

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: a7b0a49c-5ed7-4137-a283-22884e9d1ce4)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


Loading FAISS index: system_kb_store\system_kb.faiss
Loading metadata: system_kb_store\system_kb_meta.pkl
Loaded KB rows: 600
Loading embedder: sentence-transformers/all-MiniLM-L6-v2

Score: 0.6228203773498535
Supplier: Dell Inc
Market: Unknown
System: Department of Defense Enterprise Software Initiative (DOD ESI)

Score: 0.6228203773498535
Supplier: Dell Inc
Market: Unknown
System: Department of Defense Enterprise Software Initiative (DOD ESI)

Score: 0.6024371385574341
Supplier: 
Market: Unknown
System: Department of Defense Enterprise Software Initiative (DOD ESI)


In [1]:
## Final Code 

"""
================================================================================
AGENTIC DEFENSE CONTRACT EXTRACTION PIPELINE (0 -> END)
================================================================================

Author: Mukesh 

This pipeline extracts defense contract structured information from raw paragraphs
(typically DoD / defense news contract announcements) using:

Stage1: Sourcing Extractor (Base row skeleton)
Stage2: Geography Extractor (Countries, operator, regions)
Stage3: System Classifier (RAG + Taxonomy + Evidence + Reason)
Stage4: Contract Extractor (Supplier, program type, value, quantity, G2G/B2G)
Stage5: Split Engine (Operator + FMS country splitting only; NO supplier split)
Stage6: Quality Validator (Rule-based sanity checks ‚Üí PASS/FAIL)
Stage7: LLM Validator (ONLY for FAIL rows) ‚Üí correct or confirm FAIL

--------------------------------------------------------------------------------
IMPORTANT FIXES INCLUDED
--------------------------------------------------------------------------------

Supplier Name Fix:
- Supplier is extracted ONLY from strict DoD award patterns:
  ", , is awarded ..."
  ", , has been awarded ..."
  ", , was awarded ..."

- This prevents wrong supplier explosion like:
  "BAE Systems, LET, MEN" => wrongly interpreted as 3 suppliers

- Program Type Fix:
- Program type values must match EXACT allowed set:
  Procurement / Training / MRO/Support / RDT&E / Upgrade / Other Service

 Splitting Fix:
- Supplier-based splitting is REMOVED completely.
- Split logic focuses only on:
  - multi-operator allocations
  - FMS multi-country allocations (only when G2G)

--------------------------------------------------------------------------------
Output:
- One output Excel row per ‚Äúlogical contract event‚Äù
- Evidence + Reason columns are highlighted
- QA validator produces:
  - "QA Status" (PASS/FAIL)
  - "QA Flags"   (human-readable reasons)
  - "QA Fix Suggestion" (what likely needs correction)

================================================================================
"""

# ==============================================================================
# 0) IMPORTS
# ==============================================================================

import os
import re
import json
import pickle
import difflib
import datetime
from typing import Annotated, TypedDict, List, Dict, Any, Optional, Tuple

import pandas as pd
import faiss
from dateutil import parser
from dateutil.relativedelta import relativedelta
import getpass

# LangGraph / LangChain
from langchain_core.messages import AnyMessage
from langchain_core.tools import tool
from langgraph.graph import StateGraph, END, START
from langgraph.graph.message import add_messages
from pydantic import BaseModel, Field
from openai import OpenAI

# Excel formatting
from openpyxl import load_workbook
from openpyxl.styles import PatternFill, Font


# ==============================================================================
# 0.1) DEBUG LOGGING HELPERS
# ==============================================================================

def log_block(title: str, content: str):
    """
    Debug logger that prints clean separated blocks.

    Why this matters:
    - Your pipeline depends on multi-stage LLM and deterministic logic.
    - Debugging extraction errors becomes easier if every stage logs:
        - input paragraph
        - system prompt output
        - intermediate decision flags
        - final values
    """
    print("\n" + "=" * 110)
    print(title)
    print("=" * 110)
    print(content)


# ==============================================================================
# 1) RAG RETRIEVER (FAISS + METADATA)
# ==============================================================================

class SystemKBRetriever:
    """
    FAISS-based semantic retriever to improve SYSTEM classification accuracy.

    What it does:
    - Loads vector DB index from:
        system_kb.faiss
    - Loads metadata from:
        system_kb_meta.pkl

    Why:
    - Your taxonomy-based system classification improves drastically when the LLM
      sees similar ‚Äúknown-good labeled‚Äù examples from your KB.

    Output format (retrieve):
        [
          {"score": float, "meta": {KB columns}},
          ...
        ]
    """

    def __init__(self, kb_dir: str, embed_model: str = "sentence-transformers/all-MiniLM-L6-v2"):
        self.kb_dir = kb_dir
        self.embed_model = embed_model

        index_path = os.path.join(kb_dir, "system_kb.faiss")
        meta_path = os.path.join(kb_dir, "system_kb_meta.pkl")

        if not os.path.exists(index_path) or not os.path.exists(meta_path):
            raise FileNotFoundError(
                f"KB files not found in: {kb_dir}\n"
                f"Expected:\n- {index_path}\n- {meta_path}\n\n"
                f"Build the KB first using your KB builder script."
            )

        print(f"Loading FAISS Index: {index_path}")
        self.index = faiss.read_index(index_path)

        print(f"Loading KB Metadata: {meta_path}")
        with open(meta_path, "rb") as f:
            self.meta = pickle.load(f)

        print(f"KB Loaded rows: {len(self.meta)}")

        self.embedder = None

    def _lazy_load_embedder(self):
        """
        Lazy load embedding model.

        Why:
        - Faster pipeline startup
        - Avoids memory overhead until retrieval is actually needed
        """
        if self.embedder is None:
            from sentence_transformers import SentenceTransformer
            self.embedder = SentenceTransformer(self.embed_model)

    def retrieve(self, query_text: str, top_k: int = 3):
        """
        Retrieve top-k similar KB rows.

        Args:
            query_text: paragraph text to retrieve similar examples
            top_k: number of results

        Returns:
            List[Dict] with {score, meta}
        """
        import numpy as np

        query_text = str(query_text).strip()
        if not query_text:
            return []

        self._lazy_load_embedder()

        q_emb = self.embedder.encode([query_text], normalize_embeddings=True).astype("float32")
        scores, idxs = self.index.search(q_emb, top_k)

        results = []
        for score, idx in zip(scores[0], idxs[0]):
            if idx < 0:
                continue
            results.append({"score": float(score), "meta": self.meta[idx]})

        return results


# ==============================================================================
# 2) CONFIGURATION & FILE PATHS
# ==============================================================================

TAXONOMY_PATH = r"C:\Users\mukeshkr\Agentic-AI-Defense-Data-Extraction\notebook\taxonomy.json"
SUPPLIERS_PATH = r"C:\Users\mukeshkr\Agentic-AI-Defense-Data-Extraction\notebook\suppliers.json"
INPUT_EXCEL_PATH = r"C:\Users\mukeshkr\Agentic-AI-Defense-Data-Extraction\data\source_file.xlsx"
OUTPUT_EXCEL_PATH = "Processed_Defense_Data.xlsx"

RAG_KB_DIR = r"C:\Users\mukeshkr\Agentic-AI-Defense-Data-Extraction\notebook\system_kb_store"


# ==============================================================================
# 2.1) LLM CLIENT SETUP (LLMFOUNDRY)
# ==============================================================================

if "LLMFOUNDRY_TOKEN" not in os.environ:
    os.environ["LLMFOUNDRY_TOKEN"] = getpass.getpass("Enter the LLM Foundry API Key: ")

client = OpenAI(
    api_key=f'{os.environ.get("LLMFOUNDRY_TOKEN")}:my-test-project',
    base_url="https://llmfoundry.straive.com/openai/v1/",
)

retriever = SystemKBRetriever(kb_dir=RAG_KB_DIR)


# ==============================================================================
# 3) LOAD JSON HELPERS
# ==============================================================================

def load_json_file(filename, default_value):
    """
    Safely loads JSON files like:
    - taxonomy.json
    - suppliers.json

    Why:
    - Your pipeline should never crash just because file path breaks.
    - If missing, fallback to default_value.
    """
    try:
        with open(filename, "r", encoding="utf-8") as f:
            print(f"Loaded: {filename}")
            return json.load(f)
    except Exception as e:
        print(f"Warning: Could not load {filename} ({e}). Using default.")
        return default_value


raw_taxonomy = load_json_file(TAXONOMY_PATH, {})
TAXONOMY_STR = json.dumps(raw_taxonomy, separators=(",", ":"))

SUPPLIER_LIST = load_json_file(SUPPLIERS_PATH, [
    "Dell Inc", "Boeing", "Lockheed Martin", "Raytheon Technologies",
    "Northrop Grumman", "L3Harris", "BAE Systems", "General Dynamics"
])


# ==============================================================================
# 4) RULE BOOK + GEOGRAPHY
# ==============================================================================

RULE_BOOK = {
    "defensive_countermeasures": {
        "triggers": ["flare", "chaff", "countermeasure", "decoy", "mju-", "ale-"],
        "guidance": "Market Segment: 'C4ISR Systems', System Type (General): 'Defensive Systems', Specific: 'Defensive Aid Suite'"
    },
    "radars_and_sensors": {
        "triggers": ["radar", "sonar", "sensor", "an/apy", "an/tpy"],
        "guidance": "Market Segment: 'C4ISR Systems', System Type (General): 'Sensors'"
    },
    "ammunition": {
        "triggers": ["cartridge", "round", "projectile", " 5.56", " 7.62", "ammo"],
        "guidance": "Market Segment: 'Weapon Systems', System Type (General): 'Ammunition'"
    }
}

GEOGRAPHY_MAPPING = {
    "North America": ["USA", "United States", "US", "United States of America", "Canada", "America"],
    "Europe": ["UK", "United Kingdom", "Ukraine", "Germany", "France", "Italy", "Spain", "Poland", "Netherlands",
               "Norway", "Sweden", "Finland", "Denmark", "Belgium"],
    "Asia-Pacific": ["Australia", "Japan", "South Korea", "Taiwan", "India", "Singapore", "New Zealand"],
    "Middle East and North Africa": ["Israel", "Saudi Arabia", "UAE", "United Arab Emirates", "Egypt", "Qatar", "Kuwait", "Iraq"],
    "International Organisations": ["NATO", "EU", "IFU", "UN", "NSPA"]
}

# ==============================================================================
# 5) BASE HELPERS (Supplier + Dates + Region + Designators)
# ==============================================================================

PROGRAM_TYPE_ALLOWED = [
    "Procurement",
    "Training",
    "MRO/Support",
    "RDT&E",
    "Upgrade",
    "Other Service"
]

def normalize_program_type(pt: str) -> str:
    """
    Ensures Program Type always matches exact allowed taxonomy.
    """
    if not pt:
        return "Other Service"

    t = str(pt).strip().lower()

    if any(k in t for k in ["mro", "support", "maintenance", "repair", "overhaul", "sustainment", "logistics"]):
        return "MRO/Support"
    if "training" in t:
        return "Training"
    if any(k in t for k in ["rdte", "research", "development", "prototype", "test and evaluation"]):
        return "RDT&E"
    if any(k in t for k in ["upgrade", "modernization", "modification"]):
        return "Upgrade"
    if any(k in t for k in ["procure", "buy", "purchase", "production", "delivery"]):
        return "Procurement"

    return "Other Service"


def get_best_supplier_match(extracted_name: str) -> str:
    """
    FIXED SUPPLIER MATCHING LOGIC (Priority Order):
    
    1. Exact Match: "Boeing" == "Boeing"
    2. Substring Match (Known in Extracted): 
       If valid list has "BAE Systems" and extracted is "BAE Systems - Norfolk Ship Repair",
       we detect "BAE Systems" is INSIDE the extracted text.
       -> Returns "BAE Systems"
    3. Substring Match (Extracted in Known):
       extracted="Raytheon" -> matches "Raytheon Technologies"
    4. Fuzzy Match (Strict): Only high confidence (0.8) to prevent "Ship Repair" matching "Admiralty Ship".
    5. Fallback: Return raw extracted text.
    """
    if not extracted_name or str(extracted_name).strip().lower() in ["unknown", "n/a", "not applicable"]:
        return "Unknown"

    raw_name = str(extracted_name).strip()
    raw_lower = raw_name.lower()
    
    # 0) Prepare List: Sort by length (descending) so we match "General Dynamics" before "General"
    # This ensures we get the most specific match first.
    valid_suppliers = sorted([str(s) for s in SUPPLIER_LIST], key=len, reverse=True)

    # 1) Exact Match
    for s in valid_suppliers:
        if s.lower() == raw_lower:
            return s

    # 2) Reverse Substring: Check if a VALID SUPPLIER exists inside the EXTRACTED text
    # Case: Extracted = "BAE Systems - Norfolk Ship Repair"
    #       Valid List has "BAE Systems"
    #       Match Found!
    for s in valid_suppliers:
        # We ensure 's' is not just a tiny generic word like "Inc" (length check > 3)
        if len(s) > 3 and s.lower() in raw_lower:
            return s

    # 3) Forward Substring: Check if EXTRACTED text exists inside a VALID SUPPLIER
    # Case: Extracted = "Raytheon"
    #       Valid List has "Raytheon Technologies"
    for s in valid_suppliers:
        if raw_lower in s.lower():
            return s

    # 4) Strict Fuzzy Match (High Cutoff 0.8)
    # cutoff=0.8 ensures "Ship Repair" does NOT match "Admiralty Ship" (which usually shares only ~40-50%)
    matches = difflib.get_close_matches(raw_name, valid_suppliers, n=1, cutoff=0.8)
    if matches:
        return matches[0]

    # 5) No Match -> Return the raw extracted text (Better than a wrong guess)
    return raw_name


def extract_awardee_supplier_strict(paragraph: str) -> Tuple[str, str]:
    """
    Extract supplier name STRICTLY using DoD awardee sentence format.
    Then passes the raw extraction to get_best_supplier_match for fuzzy mapping.

    Patterns:
    - ", , is awarded ..."
    - ", , has been awarded ..."
    """
    text = str(paragraph).strip()

    # IMPORTANT: We stop at first comma group before "is awarded/was awarded/has been awarded"
    patterns = [
        r"^([A-Z][A-Za-z0-9&\-\.\s]+?),\s+.*?\s+(?:is|was|has been)\s+awarded\b",
        r"^([A-Z][A-Za-z0-9&\-\.\s]+?),\s+.*?\s+received\s+an?\s+award\b",
    ]

    for pat in patterns:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m:
            raw_supplier = m.group(1).strip()
            # Standardize using the updated fuzzy logic function
            final_supplier = get_best_supplier_match(raw_supplier)
            return final_supplier, raw_supplier

    return "Unknown", "Not Found"


def calculate_mro_months(start_date_str, end_date_text, program_type):
    """
    Calculates MRO duration (months) ONLY for program_type == "MRO/Support".
    """
    if program_type != "MRO/Support":
        return "Not Applicable"
    try:
        if not start_date_str or not end_date_text:
            return "Not Applicable"

        start = pd.to_datetime(start_date_str, dayfirst=True)
        end = parser.parse(str(end_date_text), fuzzy=True)

        diff = relativedelta(end, start)
        total_months = diff.years * 12 + diff.months
        return str(max(0, int(total_months)))
    except:
        return "Not Applicable"


def get_region_for_country(country_name):
    """
    Maps country -> region using GEOGRAPHY_MAPPING.
    """
    if not country_name or str(country_name).strip().lower() in ["unknown", "n/a", "not applicable"]:
        return "Unknown"

    clean = str(country_name).strip().lower()

    if clean in ["us", "usa", "u.s.", "united states", "united states of america"]:
        return "North America"
    if clean in ["uk", "u.k.", "britain", "great britain"]:
        return "Europe"

    for region, countries in GEOGRAPHY_MAPPING.items():
        if any(c.lower() == clean for c in countries):
            return region

    return "Unknown"


DESIGNATOR_PATTERNS = [
    r"\bDDG[-\s]?\d+\b", r"\bCVN[-\s]?\d+\b", r"\bSSN[-\s]?\d+\b",
    r"\bLCS[-\s]?\d+\b", r"\bLPD[-\s]?\d+\b", r"\bLHA[-\s]?\d+\b", r"\bLHD[-\s]?\d+\b",
    r"\bF-\d+\b", r"\bB-\d+\b", r"\bC-\d+\b", r"\bA-\d+\b",
    r"\bMQ-\d+\b", r"\bRQ-\d+\b",
    r"\bAN\/[A-Z0-9\-]+\b",
    r"\b(AIM|AGM|SM|RIM|MIM)-\d+\b",
]

def extract_designators(text: str):
    """
    Extract common platform/system designators from paragraph.
    """
    text = str(text)
    found = []
    for pat in DESIGNATOR_PATTERNS:
        found.extend(re.findall(pat, text, flags=re.IGNORECASE))
    cleaned = []
    for f in found:
        cleaned.append(f.upper().replace(" ", "").replace("--", "-"))
    final = []
    seen = set()
    for x in cleaned:
        if x not in seen:
            final.append(x)
            seen.add(x)
    return final


def detect_piloting_rule_based(text: str, designators: List[str]) -> str:
    """
    Deterministic piloting classification to reduce LLM mistakes.
    """
    t = str(text).lower()

    if any(d.startswith(("MQ-", "RQ-")) for d in designators):
        return "Uncrewed"
    if any(k in t for k in ["unmanned", "uav", "drone", "autonomous"]):
        return "Uncrewed"

    if any(d.startswith(("DDG", "CVN", "SSN", "LCS", "LPD", "LHA", "LHD")) for d in designators):
        return "Crewed"
    if "uss " in t:
        return "Crewed"

    return "Not Applicable"

# ==============================================================================
# 6) SPLIT ENGINE (UPDATED: Multi-Operator Fallback)
# ==============================================================================

def parse_operator_quantity_allocations(paragraph: str):
    """
    Detect quantity allocations by operator.
    Supported patterns:
      - "212 for the Navy"
      - "187 for the Air Force"
      - "84 for Foreign Military Sales (FMS) customers"
    """
    text = str(paragraph)
    allocations = []

    # Standard "Qty for Branch" pattern
    pattern = r"(\d+)\s+for\s+the\s+(Navy|Air Force|Army|Marine Corps|Space Force|Coast Guard)"
    matches = re.findall(pattern, text, flags=re.IGNORECASE)
    for qty, op in matches:
        allocations.append({"operator": op.title(), "quantity": qty, "g2g_b2g": "B2G"})

    # FMS Pattern
    fms_pattern = r"(\d+)\s+for\s+(?:Foreign Military Sales\s*\s*customers|FMS\s*customers|a\s*FMS\s*customer|FMS)"
    fms_matches = re.findall(fms_pattern, text, flags=re.IGNORECASE)
    for qty in fms_matches:
        allocations.append({"operator": "Foreign Assistance", "quantity": qty, "g2g_b2g": "G2G"})

    # Deduplicate
    unique = []
    seen = set()
    for a in allocations:
        key = (a["operator"], a["quantity"], a["g2g_b2g"])
        if key not in seen:
            unique.append(a)
            seen.add(key)

    return unique


def parse_fms_countries(paragraph: str):
    """
    Extract FMS customer countries list.
    Example: "governments of Australia, Bahrain, Belgium..."
    """
    text = str(paragraph)

    m = re.search(
        r"governments of (.+?)(?:\.\s| Work will be performed| Fiscal| This contract|$)",
        text,
        flags=re.IGNORECASE | re.DOTALL
    )
    if not m:
        return []

    block = m.group(1)
    raw = re.split(r",|\band\b", block)

    countries = []
    for c in raw:
        c = c.strip()
        if 2 < len(c) <= 40:
            countries.append(c)

    final = []
    seen = set()
    for c in countries:
        if c.lower() not in seen:
            final.append(c)
            seen.add(c.lower())

    return final


def parse_multiple_values(paragraph: str):
    """
    Detect multiple values inside paragraph.
    """
    text = str(paragraph)
    money_pattern = r"\$([\d,]+(?:\.\d+)?)"
    vals = re.findall(money_pattern, text)
    vals = list(dict.fromkeys(vals))
    return vals


def split_rows_engine(base_row: dict, paragraph: str):
    """
    MASTER SPLIT ENGINE (Updated for Explicit Operator Split)

    Logic Priorities:
    1. Quantity-based Split: If "5 for Navy, 2 for Army" found -> Split with quantities.
    2. Explicit Operator List: If no quantities found, but 'Customer Operator' column 
       contains commas (e.g. "Navy, Air Force") -> Split into rows with "Not Applicable" quantity.
    3. FMS Split: If G2G, split by FMS countries.
    """
    paragraph = str(paragraph)

    # 1. Parse regex-based allocations (High Precision)
    allocations = parse_operator_quantity_allocations(paragraph)
    
    # 2. Parse FMS countries
    fms_countries = parse_fms_countries(paragraph)
    
    # 3. Check for multi-value notes
    multi_values = parse_multiple_values(paragraph)

    split_reasons = []
    
    # Check 1: Did we find "Qty for Operator"?
    if allocations:
        split_reasons.append("Multi-operator quantity allocation found")
    
    # Check 2: If NOT, did the LLM extract multiple operators in the column?
    # e.g. base_row["Customer Operator"] = "Air Force, Navy"
    raw_operators = str(base_row.get("Customer Operator", "")).split(",")
    clean_operators = [op.strip() for op in raw_operators if op.strip() and op.strip().lower() != "unknown"]
    
    has_operator_list_split = False
    if not allocations and len(clean_operators) > 1:
        has_operator_list_split = True
        split_reasons.append("Multiple Customer Operators detected (Text Split)")

    if fms_countries:
        split_reasons.append("FMS multi-country list found")
    if len(multi_values) >= 2:
        split_reasons.append("Multiple financial values found")

    # --- NO SPLIT CASE ---
    if not split_reasons:
        base_row["Split Flag"] = "No"
        base_row["Split Reason"] = "No split condition found"
        return [base_row]

    # --- EXECUTE SPLITS ---
    rows = [base_row.copy()]
    base_reason = " | ".join(split_reasons)

    # Priority A: Quantity-based Allocation Split
    if allocations:
        new_rows = []
        for r in rows:
            for alloc in allocations:
                rr = r.copy()
                rr["Customer Operator"] = alloc["operator"]
                rr["Quantity"] = alloc["quantity"]
                rr["G2G/B2G"] = alloc["g2g_b2g"]
                rr["Split Flag"] = "Yes"
                rr["Split Reason"] = f"{base_reason} (allocations)"
                new_rows.append(rr)
        rows = new_rows
    
    # Priority B: Explicit Operator List Split (Fallback if no quantities)
    elif has_operator_list_split:
        new_rows = []
        for r in rows:
            for op_name in clean_operators:
                rr = r.copy()
                rr["Customer Operator"] = op_name
                # We don't know the quantity split, so we keep original or mark N/A
                rr["Quantity"] = "Not Applicable" 
                rr["Split Flag"] = "Yes"
                rr["Split Reason"] = f"{base_reason} (operator list)"
                new_rows.append(rr)
        rows = new_rows

    # Priority C: Multi-value annotation (Just adds a note)
    if len(multi_values) >= 2:
        for r in rows:
            note = r.get("Value Note (If Any)", "Not Applicable")
            r["Value Note (If Any)"] = f"{note} | Multiple values detected: {multi_values[:5]}"

    # Priority D: FMS Country Split (Only for G2G rows)
    if fms_countries:
        final_rows = []
        for r in rows:
            # Only split if it's actually a G2G row OR the operator is Generic FMS
            is_g2g = r.get("G2G/B2G") == "G2G"
            is_fms_op = "foreign" in str(r.get("Customer Operator")).lower()
            
            if is_g2g or is_fms_op:
                for c in fms_countries:
                    rr = r.copy()
                    rr["Customer Country"] = c
                    rr["Customer Region"] = get_region_for_country(c)
                    rr["Split Flag"] = "Yes"
                    rr["Split Reason"] = f"{base_reason} (FMS countries)"
                    final_rows.append(rr)
            else:
                final_rows.append(r)
        rows = final_rows

    # Final cleanup of flags
    for r in rows:
        r.setdefault("Split Flag", "Yes")
        r.setdefault("Split Reason", base_reason)

    return rows

# ==============================================================================
# 7) AGENTS / TOOLS
# ==============================================================================

# --------------------------------------------------------------------------
# Stage 1: SOURCING EXTRACTOR
# --------------------------------------------------------------------------

class SourcingInput(BaseModel):
    paragraph: str = Field(description="Full contract paragraph/description text.")
    url: str = Field(description="Source URL of the contract announcement/news.")
    date: str = Field(description="Contract date in Excel (string).")

@tool("sourcing_extractor")
def sourcing_extractor(paragraph: str, url: str, date: str):
    """
    Stage 1: SOURCING EXTRACTOR

    Purpose:
    - Creates the base skeleton row (stable fields).

    Output columns created:
    - Description of Contract
    - Additional Notes (Internal Only)
    - Source Link(s)
    - Contract Date
    - Reported Date (By SGA)

    Important:
    - These fields remain SAME even after splits.
    - Every split row inherits these values.
    """
    reported_date = datetime.datetime.now().strftime("%Y-%m-%d")

    notes = "Standard extraction."
    if "modification" in str(paragraph).lower():
        notes = "Contract Modification."
    if "multiple award" in str(paragraph).lower():
        notes = "Multiple award contract detected (non-supplier split)."

    return {
        "Description of Contract": paragraph,
        "Additional Notes (Internal Only)": notes,
        "Source Link(s)": url,
        "Contract Date": date,
        "Reported Date (By SGA)": reported_date
    }


# --------------------------------------------------------------------------
# Stage 2: GEOGRAPHY EXTRACTOR
# --------------------------------------------------------------------------

class GeographyInput(BaseModel):
    paragraph: str = Field(description="Full contract paragraph/description text.")

@tool("geography_extractor")
def geography_extractor(paragraph: str):
    """
    Stage 2: GEOGRAPHY EXTRACTOR

    Purpose:
    - Extract geo + operator fields:
      - Customer Country
      - Customer Operator
      - Supplier Country

    Derivations:
      - Customer Region
      - Supplier Region
      - Domestic Content (Indigenous vs Imported)

    Notes:
    - Supplier Country is NOT supplier name.
    - Supplier Name is extracted later in Stage4.
    """
    sys_prompt = """
Extract: Customer Country, Supplier Country, Customer Operator.

Strict Rules:
- If the paragraph mentions "Navy", "Air Force", "Army", "Marine Corps"
  and it is in buyer context -> set Customer Operator accordingly.
- If the paragraph is FMS, customer might be "Foreign Military Sales",
  but country list is handled later in split stage.
- Return ONLY JSON.

Return JSON:
{
  "Customer Country": "...",
  "Customer Operator": "...",
  "Supplier Country": "..."
}
"""

    log_block("HUMAN MESSAGE (Stage2 - Geography)", paragraph)

    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "system", "content": sys_prompt},
                      {"role": "user", "content": paragraph}],
            temperature=0,
            response_format={"type": "json_object"}
        )
        raw = json.loads(completion.choices[0].message.content)
        log_block("AI RESPONSE (Stage2 - Geography)", json.dumps(raw, indent=2))
    except Exception as e:
        raw = {}
        log_block("AI ERROR (Stage2 - Geography)", str(e))

    cust = raw.get("Customer Country", "Unknown")
    supp = raw.get("Supplier Country", "Unknown")

    domestic = "Indigenous" if str(cust).lower() == str(supp).lower() else "Imported"

    return {
        "Customer Region": get_region_for_country(cust),
        "Customer Country": cust,
        "Customer Operator": raw.get("Customer Operator", "Unknown"),
        "Supplier Region": get_region_for_country(supp),
        "Supplier Country": supp,
        "Domestic Content": domestic
    }


# --------------------------------------------------------------------------
# Stage 3: SYSTEM CLASSIFIER (RAG-ENHANCED)
# --------------------------------------------------------------------------

class SystemInput(BaseModel):
    paragraph: str = Field(description="Full contract paragraph/description text.")

@tool("system_classifier")
def system_classifier(paragraph: str):
    """
    Stage 3: SYSTEM CLASSIFIER (RAG-Enhanced)

    Purpose:
    - Determine defense system labels using:
      Taxonomy reference
      Rule book overrides
      RAG KB similar examples
      Deterministic piloting override

    Output:
    - Market Segment
    - System Type (General)
    - System Type (Specific)
    - System Name (General)
    - System Name (Specific)
    - System Piloting
    - Evidence + Reason for each label
    - Confidence

    Strict output rules:
    - Return FLAT JSON object only
    - Each value must be STRING
    - Evidence must be exact copied substring or "Not Found"
    """
    paragraph = str(paragraph).strip()
    if not paragraph:
        return {}

    log_block("HUMAN MESSAGE (Stage3 - System)", paragraph)

    lower_text = paragraph.lower()
    hints = [
        f"RULE: {v['guidance']}"
        for _, v in RULE_BOOK.items()
        if any(t in lower_text for t in v["triggers"])
    ]
    hint_str = "\n".join(hints) if hints else "No special override rules triggered."

    designators = extract_designators(paragraph)
    piloting_rule = detect_piloting_rule_based(paragraph, designators)

    rag_hits = retriever.retrieve(paragraph, top_k=3)
    rag_examples = []
    for hit in rag_hits:
        meta = hit["meta"]
        rag_examples.append({
            "score": round(hit["score"], 4),
            "Market Segment": meta.get("Market Segment", ""),
            "System Type (General)": meta.get("System Type (General)", ""),
            "System Type (Specific)": meta.get("System Type (Specific)", ""),
            "System Name (General)": meta.get("System Name (General)", ""),
            "System Name (Specific)": meta.get("System Name (Specific)", ""),
            "System Piloting": meta.get("System Piloting", ""),
            "Snippet": meta.get("Description of Contract", "")[:220] + "..."
        })

    sys_prompt = f"""
You are a Senior Defense System Classification Analyst.

REFERENCE TAXONOMY:
{TAXONOMY_STR}

RULE BOOK OVERRIDES:
{hint_str}

OUTPUT RULES:
- Return ONLY a FLAT JSON object.
- Every value must be a STRING.
- Do NOT return nested objects or lists.
- Evidence must be copied EXACTLY from paragraph text.
- If evidence not present, output "Not Found".

Return JSON:
{{
  "Market Segment": "",
  "Market Segment Evidence": "",
  "Market Segment Reason": "",

  "System Type (General)": "",
  "System Type (General) Evidence": "",
  "System Type (General) Reason": "",

  "System Type (Specific)": "",
  "System Type (Specific) Evidence": "",
  "System Type (Specific) Reason": "",

  "System Name (General)": "",
  "System Name (General) Evidence": "",
  "System Name (General) Reason": "",

  "System Name (Specific)": "",
  "System Name (Specific) Evidence": "",
  "System Name (Specific) Reason": "",

  "System Piloting": "",
  "System Piloting Evidence": "",
  "System Piloting Reason": "",

  "Confidence": "High/Medium/Low"
}}
"""

    user_prompt = f"""
PARAGRAPH:
{paragraph}

DESIGNATORS (regex extracted):
{designators if designators else "None"}

RULE BASED PILOTING:
{piloting_rule}

RAG EXAMPLES:
{json.dumps(rag_examples, indent=2)}
"""

    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "system", "content": sys_prompt},
                      {"role": "user", "content": user_prompt}],
            temperature=0,
            response_format={"type": "json_object"}
        )
        result = json.loads(completion.choices[0].message.content)
        log_block("AI RESPONSE (Stage3 - System)", json.dumps(result, indent=2))

        # Hard override piloting rule
        result["System Piloting"] = piloting_rule
        result.setdefault("System Piloting Evidence", "Not Found")
        result.setdefault("System Piloting Reason", "Derived from deterministic piloting rules.")

        # Ensure flat
        for k, v in result.items():
            if isinstance(v, (dict, list)):
                result[k] = str(v)

        return result

    except Exception as e:
        log_block("AI ERROR (Stage3 - System)", str(e))
        return {
            "Market Segment": "",
            "Market Segment Evidence": "Not Found",
            "Market Segment Reason": "",

            "System Type (General)": "",
            "System Type (General) Evidence": "Not Found",
            "System Type (General) Reason": "",

            "System Type (Specific)": "",
            "System Type (Specific) Evidence": "Not Found",
            "System Type (Specific) Reason": "",

            "System Name (General)": "",
            "System Name (General) Evidence": "Not Found",
            "System Name (General) Reason": "",

            "System Name (Specific)": "",
            "System Name (Specific) Evidence": "Not Found",
            "System Name (Specific) Reason": "",

            "System Piloting": piloting_rule,
            "System Piloting Evidence": "Not Found",
            "System Piloting Reason": "Derived from deterministic piloting rules.",

            "Confidence": "Low",
            "Error": str(e)
        }


# --------------------------------------------------------------------------
# Stage 4: CONTRACT EXTRACTOR (SUPPLIER FIXED)
# --------------------------------------------------------------------------

class ContractInfoInput(BaseModel):
    paragraph: str = Field(description="Full contract paragraph/description text.")
    contract_date: str = Field(description="Contract date as string.")

@tool("contract_extractor")
def contract_extractor(paragraph: str, contract_date: str):
    """
    Stage 4: CONTRACT EXTRACTOR (Supplier + Financial + Program details)

    Purpose:
    - Extract:
      Supplier Name  (STRICT award-pattern extraction only)
      Program Type   (must be EXACT allowed set; MRO/Support fixed)
      Quantity
      Value (Million)
      Currency
      Value Certainty
      G2G/B2G
      Completion Date Text

    CRITICAL SUPPLIER FIX:
    - Supplier Name must come from the DoD award format:
      ", , is awarded ..."
    - We DO NOT scan paragraph for multiple suppliers.
    - We DO NOT split on supplier.
    - If strict pattern fails -> Supplier = Unknown

    Output JSON:
    - Flat dictionary with correct normalized fields
    """
    # STRICT supplier extraction first
    supplier_name, supplier_evidence = extract_awardee_supplier_strict(paragraph)

    # Program type + financial extraction by LLM
    system_instruction = """
You are a Defense Contract Financial Analyst.

Your job:
- Extract only factual contract financial + program info.

STRICT PROGRAM TYPE ENUM (must output EXACT string from list):
- Procurement
- Training
- MRO/Support
- RDT&E
- Upgrade
- Other Service

Rules:
1) program_type MUST be one of the allowed strings above.
2) value_certainty must be Confirmed or Estimated
3) quantity must be numeric if possible else "Not Applicable"
4) g2g_b2g:
   - "G2G" ONLY if FMS/Foreign Military Sales is clearly mentioned
   - Otherwise "B2G"
5) completion_date_text: keep raw completion date phrase if exists

Return ONLY JSON object.
"""

    user_prompt = f"""
PARAGRAPH:
{paragraph}

SIGNED DATE:
{contract_date}

Return JSON:
{{
  "program_type": "",
  "value_million_raw": "",
  "currency_code": "",
  "value_certainty": "",
  "quantity": "",
  "completion_date_text": "",
  "g2g_b2g": "",
  "value_note": ""
}}
"""

    log_block("HUMAN MESSAGE (Stage4 - Contract)", paragraph)

    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "system", "content": system_instruction},
                      {"role": "user", "content": user_prompt}],
            temperature=0,
            response_format={"type": "json_object"}
        )
        raw = json.loads(completion.choices[0].message.content)
        log_block("AI RESPONSE (Stage4 - Contract)", json.dumps(raw, indent=2))
    except Exception as e:
        log_block("AI ERROR (Stage4 - Contract)", str(e))
        raw = {}

    # Normalize Program Type strictly
    pt_raw = raw.get("program_type", "")
    program_type = normalize_program_type(pt_raw)

    mro_months = calculate_mro_months(contract_date, raw.get("completion_date_text"), program_type)

    # Parse value
    try:
        val_str = str(raw.get("value_million_raw", "0")).replace(",", "").replace("$", "")
        val_float = float(val_str)
        val_formatted = "{:.3f}".format(val_float)
    except:
        val_formatted = "0.000"

    # Signing month/year
    try:
        dt = pd.to_datetime(contract_date)
        sign_month = dt.strftime("%B")
        sign_year = str(dt.year)
    except:
        sign_month, sign_year = "Unknown", "Unknown"

    return {
        "Supplier Name": supplier_name,
        "Supplier Name Evidence": supplier_evidence,

        "Program Type": program_type,
        "Expected MRO Contract Duration (Months)": mro_months,

        "Quantity": raw.get("quantity", "Not Applicable"),
        "Value Certainty": raw.get("value_certainty", "Confirmed"),

        "Value (Million)": val_formatted,
        "Currency": raw.get("currency_code", "USD$"),
        "Value (USD$ Million)": val_formatted,

        "Value Note (If Any)": raw.get("value_note", "Not Applicable"),
        "G2G/B2G": raw.get("g2g_b2g", "B2G"),

        "Signing Month": sign_month,
        "Signing Year": sign_year
    }


# --------------------------------------------------------------------------
# Stage 5: SPLITTER AGENT
# --------------------------------------------------------------------------

class SplitterInput(BaseModel):
    paragraph: str = Field(description="Full contract paragraph/description text.")
    base_row: dict = Field(description="Extracted row after Stage1-4.")

@tool("splitter_agent")
def splitter_agent(paragraph: str, base_row: dict):
    """
    Stage 5: SPLITTER AGENT

    Purpose:
    - Applies deterministic split logic to generate multiple output rows
      when paragraph has explicit multi allocations.

    Supported splits:
    Operator/Quantity split ("212 for the Navy", "187 for the Air Force")
    FMS country split (only for rows marked as G2G)
    Multi-value note (does not split, only notes)

    IMPORTANT:
    - Supplier split is REMOVED to prevent wrong supplier explosions.
    """
    try:
        rows = split_rows_engine(base_row, paragraph)
        for r in rows:
            r.setdefault("Split Flag", "No")
            r.setdefault("Split Reason", "")
        return {"rows": rows}
    except Exception as e:
        base_row["Split Flag"] = "Error"
        base_row["Split Reason"] = f"Split failed: {str(e)}"
        return {"rows": [base_row]}


# --------------------------------------------------------------------------
# Stage 6: QUALITY VALIDATOR AGENT (RULE-BASED)
# --------------------------------------------------------------------------

class QAInput(BaseModel):
    paragraph: str = Field(description="Original paragraph for reference and validation.")
    rows: list = Field(description="Final split rows list output from Stage5.")

@tool("quality_validator")
def quality_validator(paragraph: str, rows: list):
    """
    Stage 6: QUALITY VALIDATOR (Rule-Based)

    Purpose:
    - Detect obvious wrong extractions and flag them.
    - This helps reduce garbage rows going to the output Excel.

    Validation Checks (examples):
    1) Supplier Name must NOT be Unknown if strict award pattern exists
    2) Supplier Name must NOT be numeric or short garbage tokens
    3) Program Type must always be one of allowed enum
    4) System Market/System Type should not be empty
    5) If G2G then FMS keyword must exist (soft check)
    6) Value must be numeric >= 0

    Output:
    - Adds columns to each row:
        QA Status = PASS/FAIL
        QA Flags = "..." reasons
        QA Fix Suggestion = what to correct
    """
    text = str(paragraph).lower()

    validated_rows = []
    for r in rows:
        flags = []
        fixes = []

        supplier = str(r.get("Supplier Name", "")).strip()
        supplier_ev = str(r.get("Supplier Name Evidence", "")).strip()
        program_type = str(r.get("Program Type", "")).strip()

        market = str(r.get("Market Segment", "")).strip()
        sys_gen = str(r.get("System Type (General)", "")).strip()

        g2g_b2g = str(r.get("G2G/B2G", "")).strip()

        value_m = str(r.get("Value (Million)", "")).strip()

        # --- Supplier validations
        if supplier.lower() in ["unknown", "n/a", "not applicable", ""]:
            # If paragraph looks like award pattern exists -> supplier must not be unknown
            if re.search(r"\b(is|was|has been)\s+awarded\b", text):
                flags.append("Supplier is Unknown but award pattern exists")
                fixes.append("Re-extract supplier using strict awardee supplier extraction")

        if len(supplier) > 0 and len(supplier) <= 2:
            flags.append("Supplier name too short / garbage")
            fixes.append("Supplier extraction likely wrong; enforce strict pattern")

        if supplier.isdigit():
            flags.append("Supplier name is numeric")
            fixes.append("Supplier extraction corrupted; enforce strict pattern")

        # If supplier evidence exists, it must not contain commas indicating location misuse
        if supplier_ev != "Not Found" and "," in supplier_ev:
            flags.append("Supplier evidence contains commas (may include location)")
            fixes.append("Ensure only supplier name captured before first comma")

        # --- Program type validations
        if program_type not in PROGRAM_TYPE_ALLOWED:
            flags.append("Program Type not in allowed enum")
            fixes.append(f"Normalize Program Type using allowed enum: {PROGRAM_TYPE_ALLOWED}")

        # --- System classification validations
        if not market:
            flags.append("Market Segment empty")
            fixes.append("System classifier must output Market Segment")
        if not sys_gen:
            flags.append("System Type (General) empty")
            fixes.append("System classifier must output System Type (General)")

        # --- G2G soft check
        if g2g_b2g == "G2G" and "fms" not in text and "foreign military sales" not in text:
            flags.append("Row marked G2G but paragraph does not mention FMS")
            fixes.append("Re-evaluate G2G/B2G detection")

        # --- Value validation
        try:
            v = float(value_m)
            if v < 0:
                flags.append("Value (Million) is negative")
                fixes.append("Fix financial value extraction")
        except:
            flags.append("Value (Million) not numeric")
            fixes.append("Fix value parsing and enforce numeric output")

        qa_status = "PASS" if len(flags) == 0 else "FAIL"

        rr = r.copy()
        rr["QA Status"] = qa_status
        rr["QA Flags"] = " | ".join(flags) if flags else "None"
        rr["QA Fix Suggestion"] = " | ".join(fixes) if fixes else "None"

        validated_rows.append(rr)

    return {"rows": validated_rows}


# --------------------------------------------------------------------------
# Stage 7: LLM VALIDATOR (ONLY FOR FAIL ROWS)
# --------------------------------------------------------------------------

class LLMValidateInput(BaseModel):
    paragraph: str = Field(description="Original paragraph text.")
    row: dict = Field(description="One single FAIL row to validate/correct.")

@tool("llm_fail_row_validator")
def llm_fail_row_validator(paragraph: str, row: dict):
    """
    Stage 7: LLM FAIL ROW VALIDATOR (Runs ONLY if QA Status = FAIL)

    Purpose:
    - For FAIL rows, ask LLM to:
      confirm which fields are wrong
      propose corrected values
      keep stable fields unchanged

    This stage is OPTIONAL but very useful because:
    - Rule-based validator detects the mistake
    - LLM can fix the row if fix is obvious

    Strong Rules:
    - Do NOT hallucinate supplier names
    - Supplier must follow award pattern
    - Program Type must match allowed enum

    Output:
    - corrected_row JSON (flat)
    - keep original if correction uncertain
    """
    sys_prompt = f"""
You are a Defense Contract Data Quality Auditor.

You will receive:
1) Original paragraph
2) A structured extracted row marked as FAIL

Your task:
- Fix ONLY fields that are clearly wrong.
- Do NOT invent values.
- Supplier Name MUST come from award pattern:
  ", , is/was/has been awarded"

Program Type MUST be one of:
{PROGRAM_TYPE_ALLOWED}

Return JSON:
{{
  "Supplier Name": "",
  "Program Type": "",
  "G2G/B2G": "",
  "Value (Million)": "",
  "Quantity": "",
  "Fix Summary": ""
}}
"""

    user_prompt = f"""
PARAGRAPH:
{paragraph}

FAIL ROW JSON:
{json.dumps(row, indent=2)}
"""

    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "system", "content": sys_prompt},
                      {"role": "user", "content": user_prompt}],
            temperature=0,
            response_format={"type": "json_object"}
        )
        fix = json.loads(completion.choices[0].message.content)
        log_block("AI RESPONSE (Stage7 - LLM FAIL FIX)", json.dumps(fix, indent=2))
    except Exception as e:
        log_block("AI ERROR (Stage7 - LLM FAIL FIX)", str(e))
        return {"row": row}

    # Apply corrections carefully
    corrected = row.copy()

    if fix.get("Supplier Name"):
        corrected["Supplier Name"] = get_best_supplier_match(fix["Supplier Name"])

    if fix.get("Program Type"):
        corrected["Program Type"] = normalize_program_type(fix["Program Type"])

    if fix.get("G2G/B2G"):
        corrected["G2G/B2G"] = fix["G2G/B2G"]

    if fix.get("Value (Million)"):
        corrected["Value (Million)"] = fix["Value (Million)"]
        corrected["Value (USD$ Million)"] = fix["Value (Million)"]

    if fix.get("Quantity"):
        corrected["Quantity"] = fix["Quantity"]

    corrected["LLM QA Fix Summary"] = fix.get("Fix Summary", "Not Provided")

    return {"row": corrected}


# ==============================================================================
# 8) LANGGRAPH PIPELINE
# ==============================================================================

class AgentState(TypedDict):
    """
    LangGraph state flowing between nodes.

    Fields:
    - input_text: contract paragraph
    - input_date: signed date
    - input_url: source link
    - final_data: aggregated dict from Stage1-4
    - final_rows: list rows after Stage5 split
    - validated_rows: list rows after Stage6 QA
    - final_rows_post_llm: optional rows after Stage7 fixes
    - messages: LangGraph internal
    """
    input_text: str
    input_date: str
    input_url: str

    final_data: dict
    final_rows: list
    validated_rows: list
    final_rows_post_llm: list

    messages: Annotated[List[AnyMessage], add_messages]


def stage_1_sourcing(state: AgentState):
    """
    Node Stage1:
    - Runs sourcing_extractor
    - Creates base stable metadata row skeleton
    """
    res = sourcing_extractor.invoke({
        "paragraph": state["input_text"],
        "url": state["input_url"],
        "date": state["input_date"],
    })
    new_data = state.get("final_data", {}).copy()
    new_data.update(res)
    return {"final_data": new_data}


def stage_2_geography(state: AgentState):
    """
    Node Stage2:
    - Extract geography + operator fields
    - Adds region mapping + domestic content
    """
    res = geography_extractor.invoke({"paragraph": state["input_text"]})
    new_data = state.get("final_data", {}).copy()
    new_data.update(res)
    return {"final_data": new_data}


def stage_3_system(state: AgentState):
    """
    Node Stage3:
    - Classifies system-level taxonomy labels
    - Uses RAG + evidence + reason + piloting rules
    """
    res = system_classifier.invoke({"paragraph": state["input_text"]})
    new_data = state.get("final_data", {}).copy()
    new_data.update(res)
    return {"final_data": new_data}


def stage_4_contract(state: AgentState):
    """
    Node Stage4:
    - Extracts contract-level fields like:
        Supplier Name (STRICT award pattern extraction)
        Program Type (normalized to allowed enum; MRO/Support fixed)
        Value, Quantity, Currency, G2G/B2G

    IMPORTANT:
    - Supplier Name is NEVER split into multiple suppliers.
    - This fixes your major corruption issue.
    """
    res = contract_extractor.invoke({
        "paragraph": state["input_text"],
        "contract_date": state["input_date"]
    })
    new_data = state.get("final_data", {}).copy()
    new_data.update(res)
    return {"final_data": new_data}


def stage_5_split(state: AgentState):
    """
    Node Stage5:
    - Applies deterministic split logic.
    - Supported splits:
        Operator allocation split
        FMS country split (G2G only)
    - Supplier splitting is completely removed.
    """
    res = splitter_agent.invoke({
        "paragraph": state["input_text"],
        "base_row": state["final_data"]
    })
    return {"final_rows": res.get("rows", [state["final_data"]])}


def stage_6_quality_validator(state: AgentState):
    """
    Node Stage6:
    - Runs rule-based quality validator.
    - Adds QA Status and QA Flags to each output row.

    Output:
    - validated_rows
    """
    res = quality_validator.invoke({
        "paragraph": state["input_text"],
        "rows": state["final_rows"]
    })
    return {"validated_rows": res.get("rows", state["final_rows"])}


def stage_7_llm_fix_fail_rows(state: AgentState):
    """
    Node Stage7:
    - Runs LLM validator ONLY on FAIL rows.
    - PASS rows remain unchanged.

    Why:
    - Optimizes cost + avoids LLM touching correct rows unnecessarily
    """
    paragraph = state["input_text"]
    validated_rows = state.get("validated_rows", [])

    fixed_rows = []
    for r in validated_rows:
        if r.get("QA Status") == "FAIL":
            fix_res = llm_fail_row_validator.invoke({"paragraph": paragraph, "row": r})
            fixed_rows.append(fix_res.get("row", r))
        else:
            fixed_rows.append(r)

    return {"final_rows_post_llm": fixed_rows}


workflow = StateGraph(AgentState)

workflow.add_node("Stage1", stage_1_sourcing)
workflow.add_node("Stage2", stage_2_geography)
workflow.add_node("Stage3", stage_3_system)
workflow.add_node("Stage4", stage_4_contract)
workflow.add_node("Stage5", stage_5_split)
workflow.add_node("Stage6", stage_6_quality_validator)
workflow.add_node("Stage7", stage_7_llm_fix_fail_rows)

workflow.add_edge(START, "Stage1")
workflow.add_edge("Stage1", "Stage2")
workflow.add_edge("Stage2", "Stage3")
workflow.add_edge("Stage3", "Stage4")
workflow.add_edge("Stage4", "Stage5")
workflow.add_edge("Stage5", "Stage6")
workflow.add_edge("Stage6", "Stage7")
workflow.add_edge("Stage7", END)

app = workflow.compile()


# ==============================================================================
# 9) GRAPH EXPORT (OFFLINE SAFE)
# ==============================================================================

def export_workflow_mermaid(app_obj, out_file="workflow.mmd"):
    """
    Exports Mermaid graph as TEXT locally.

    Why:
    - Some machines block online Mermaid rendering
    - This provides offline documentation

    Output:
    - workflow.mmd text file
    """
    mmd = app_obj.get_graph().draw_mermaid()
    with open(out_file, "w", encoding="utf-8") as f:
        f.write(mmd)
    print(f"Workflow Mermaid saved locally: {out_file}")
    return out_file


# ==============================================================================
# 10) EXCEL HIGHLIGHTING FEATURE
# ==============================================================================

def highlight_evidence_reason_columns(excel_path: str):
    """
    Highlights Evidence + Reason columns in output Excel.

    Evidence Columns:
    - Light yellow

    Reason Columns:
    - Light blue

    Purpose:
    - Your team can validate WHY the label was chosen quickly.
    """
    wb = load_workbook(excel_path)
    ws = wb.active

    header = [cell.value for cell in ws[1]]

    evidence_cols = []
    reason_cols = []

    for idx, col_name in enumerate(header, start=1):
        if isinstance(col_name, str) and "Evidence" in col_name:
            evidence_cols.append(idx)
        if isinstance(col_name, str) and "Reason" in col_name:
            reason_cols.append(idx)

    evidence_fill = PatternFill(start_color="FFF2CC", end_color="FFF2CC", fill_type="solid")
    reason_fill = PatternFill(start_color="D9E1F2", end_color="D9E1F2", fill_type="solid")
    header_font = Font(bold=True)

    for col_idx in evidence_cols:
        ws.cell(row=1, column=col_idx).fill = evidence_fill
        ws.cell(row=1, column=col_idx).font = header_font

    for col_idx in reason_cols:
        ws.cell(row=1, column=col_idx).fill = reason_fill
        ws.cell(row=1, column=col_idx).font = header_font

    for row in range(2, ws.max_row + 1):
        for col_idx in evidence_cols:
            ws.cell(row=row, column=col_idx).fill = evidence_fill
        for col_idx in reason_cols:
            ws.cell(row=row, column=col_idx).fill = reason_fill

    wb.save(excel_path)
    print("Evidence + Reason columns highlighted successfully.")


# ==============================================================================
# 11) MAIN EXECUTION
# ==============================================================================

if __name__ == "__main__":

    print(f"\nüìå Loading Input File: {INPUT_EXCEL_PATH}")

    # offline safe workflow graph
    export_workflow_mermaid(app, out_file="workflow.mmd")

    try:
        df_input = pd.read_excel(INPUT_EXCEL_PATH)

        required_cols = ["Source URL", "Contract Date", "Contract Description"]
        if not all(col in df_input.columns for col in required_cols):
            raise ValueError(f"Excel file must contain columns: {required_cols}")

        print(f"üöÄ Processing {len(df_input)} rows...")
        results = []

        for index, row in df_input.iterrows():
            print(f"\nüîπ Row {index + 1}/{len(df_input)}")

            desc = str(row["Contract Description"]) if pd.notna(row["Contract Description"]) else ""
            c_date = str(row["Contract Date"]) if pd.notna(row["Contract Date"]) else str(datetime.date.today())
            c_url = str(row["Source URL"]) if pd.notna(row["Source URL"]) else ""

            initial_state: AgentState = {
                "input_text": desc,
                "input_date": c_date,
                "input_url": c_url,

                "final_data": {},
                "final_rows": [],
                "validated_rows": [],
                "final_rows_post_llm": [],

                "messages": []
            }

            output_state = app.invoke(initial_state)

            rows = output_state.get("final_rows_post_llm", [])
            if not rows:
                rows = output_state.get("validated_rows", [])
            if not rows:
                rows = output_state.get("final_rows", [])
            if not rows:
                rows = [output_state.get("final_data", {})]

            results.extend(rows)

        df_final = pd.DataFrame(results)

        FINAL_COLUMNS = [
            "Customer Region", "Customer Country", "Customer Operator",
            "Supplier Region", "Supplier Country", "Domestic Content",

            "Split Flag", "Split Reason",

            "Market Segment", "Market Segment Evidence", "Market Segment Reason",
            "System Type (General)", "System Type (General) Evidence", "System Type (General) Reason",
            "System Type (Specific)", "System Type (Specific) Evidence", "System Type (Specific) Reason",
            "System Name (General)", "System Name (General) Evidence", "System Name (General) Reason",
            "System Name (Specific)", "System Name (Specific) Evidence", "System Name (Specific) Reason",
            "System Piloting", "System Piloting Evidence", "System Piloting Reason",
            "Confidence",

            "Supplier Name", "Supplier Name Evidence",
            "Program Type", "Expected MRO Contract Duration (Months)",
            "Quantity", "Value Certainty", "Value (Million)", "Currency",
            "Value (USD$ Million)", "Value Note (If Any)", "G2G/B2G",
            "Signing Month", "Signing Year",

            "QA Status", "QA Flags", "QA Fix Suggestion",
            "LLM QA Fix Summary",

            "Description of Contract",
            "Additional Notes (Internal Only)",
            "Source Link(s)",
            "Contract Date",
            "Reported Date (By SGA)"
        ]

        df_final = df_final.reindex(columns=FINAL_COLUMNS, fill_value="")
        df_final.to_excel(OUTPUT_EXCEL_PATH, index=False)

        highlight_evidence_reason_columns(OUTPUT_EXCEL_PATH)

        print("\n Processing Complete!")
        print(f"Output File Saved: {OUTPUT_EXCEL_PATH}")
        print(df_final.head(3).to_string(index=False))

    except Exception as e:
        print(f"\n ERROR: {e}")

Loading FAISS Index: C:\Users\mukeshkr\Agentic-AI-Defense-Data-Extraction\notebook\system_kb_store\system_kb.faiss
Loading KB Metadata: C:\Users\mukeshkr\Agentic-AI-Defense-Data-Extraction\notebook\system_kb_store\system_kb_meta.pkl
KB Loaded rows: 600
Loaded: C:\Users\mukeshkr\Agentic-AI-Defense-Data-Extraction\notebook\taxonomy.json
Loaded: C:\Users\mukeshkr\Agentic-AI-Defense-Data-Extraction\notebook\suppliers.json

üìå Loading Input File: C:\Users\mukeshkr\Agentic-AI-Defense-Data-Extraction\data\source_file.xlsx
Workflow Mermaid saved locally: workflow.mmd
üöÄ Processing 1 rows...

üîπ Row 1/1

HUMAN MESSAGE (Stage2 - Geography)
Northrop Grumman Systems Corp., Aerospace Systems, Melbourne, Florida, is awarded a $12,015,026 modification (P00036) to a previously awarded cost-plus-fixed-fee contract (N0001914C0036). This modification increases the ceiling to extend services and adds hours increasing the full-scale fatigue repair time to achieve the required simulated flight hours i

  from .autonotebook import tqdm as notebook_tqdm



AI RESPONSE (Stage3 - System)
{
  "Market Segment": "Air Platforms",
  "Market Segment Evidence": "Northrop Grumman Systems Corp., Aerospace Systems, Melbourne, Florida, is awarded a $12,015,026 modification (P00036) to a previously awarded cost-plus-fixed-fee contract (N0001914C0036).",
  "Market Segment Reason": "The context involves aerospace systems related to aircraft development.",
  "System Type (General)": "Fixed Wing",
  "System Type (General) Evidence": "This modification increases the ceiling to extend services and adds hours increasing the full-scale fatigue repair time to achieve the required simulated flight hours in support of E-2D Advanced Hawkeye aircraft development.",
  "System Type (General) Reason": "The E-2D Advanced Hawkeye is a fixed-wing aircraft.",
  "System Type (Specific)": "AEW&C",
  "System Type (Specific) Evidence": "in support of E-2D Advanced Hawkeye aircraft development.",
  "System Name (General)": "E-2D Advanced Hawkeye",
  "System Name (General) Ev