In [1]:
import os
import re
import json
import pickle
import pandas as pd
import numpy as np
import faiss
import instructor
import difflib
import datetime
from typing import List, Literal, Optional
from collections import Counter
from pydantic import BaseModel, Field
from openai import OpenAI
from sentence_transformers import SentenceTransformer

# ==============================================================================
# 1. CONFIGURATION & DATA ASSETS
# ==============================================================================

# --- API KEY ---
if "LLMFOUNDRY_TOKEN" not in os.environ:
    # Prompt with getpass instead of input() so the API key is not echoed
    # back to the terminal / stored in the notebook's cell output.
    import getpass

    os.environ["LLMFOUNDRY_TOKEN"] = getpass.getpass("Enter your LLM Foundry API Key: ")

# --- VERIFIED SUPPLIER LIST (Source of Truth) ---
# Canonical supplier names. clean_supplier_name() returns an entry from this
# list when the raw extracted name matches exactly or fuzzily (difflib).
# NOTE(review): "Tales" appears alongside "Thales" below — possibly a typo in
# the source list; confirm before relying on it as a distinct supplier.
VERIFIED_SUPPLIERS = [
    "22nd Century Tech", "A&P Group", "AAR Supply Chain Inc.", "Aardvark Clear", "AASKI Technology", "Abacus Tech Corp", 
    "Abdallah Al-Faris", "Abeking Rasmuss", "ABG Shipyards", "Absher Construction Co.", "Abu Dhabi MAR", "Accenture", 
    "Aceinfo Solutions", "Achleitner", "ACMI", "ActioNet", "ADCOM Systems", "Admiralty Ship", "Advanced Navigation", 
    "AECOM", "Aegis Technologies", "Aerojet Rocketdyne", "AeroVironment", "AgEagle", "Agiliti Health", "Airbus", 
    "Airbus-Rheinmetall", "Airborne Tactical Advantage Co.", "Aircell", "Aircraft Readiness Alliance", "Akkodis", 
    "Alion Science", "Allen-Vanguard", "Alliant Techsystems Operations", "Allison Transmission", "Alpha Marine", 
    "Alpine Armoring Inc.", "Altawest", "AM General", "Amazon", "Amentum Services", "American Systems Corp", 
    "Anduril Industries", "Antonov", "Arcfield Canada", "Archer Aviation", "ARES Shipyard", "Argon ST Inc.", 
    "Armoured Car Sys", "ARMSCOR", "Arnold Defense", "Arquus", "ASRC Federal", "AT&T", "Atlas Elektronik", 
    "Austal Limited", "Avibras", "AVIC", "Babcock Group", "BAE Systems", "Ball Corporation", "Bath Iron Works", 
    "Baykar", "Bechtel Group", "Bell Textron", "Bharat Dynamics", "Bharat Electronics", "Black Berry", "BlueHalo", 
    "Boeing", "Booz Allen Hamilton", "CACI", "CAE USA", "Cammell Laird", "Caterpillar", "Cessna", "Chemring", 
    "Cisco Systems", "Cobham", "Collins Aerospace", "Colt's Manufacturing", "Crowley Maritime", "Cubic Corporation", 
    "Curtiss-Wright", "Damen Shipyards", "Dassault", "Day & Zimmerman", "DCS Corp.", "De Havilland", "Dell Inc", 
    "Deloitte", "Denel", "Diehl", "Draken", "DRS Network and Imaging Systems", "DynCorp Int'l", "Dynetics", 
    "Elbit Systems", "Embraer", "Esterline", "Eurofighter", "Fincantieri", "FLIR", "Fluor", "FN Herstal", "Ford", 
    "General Atomics", "General Dynamics", "General Electric", "General Motors", "GKN Aerospace", "Global Military Products", 
    "Gulfstream", "Hanwha", "Harris", "Hensoldt", "Honeywell", "Huntington Ingalls", "Hyundai Heavy Industries", 
    "IBM", "Indra", "Insitu", "Israel Aerospace Industries", "Iveco", "Jacobs", "Johns Hopkins APL", "Kaman", 
    "KBR", "Kongsberg", "Kratos Defense", "L3Harris", "Leidos", "Leonardo", "Lockheed Martin", "Lurssen", 
    "ManTech", "Maxar Technologies", "MBDA", "Mercedes-Benz", "Mercury Systems", "Microsoft", "Mitsubishi Heavy Industries", 
    "Motorola Solutions", "Nammo", "Naval Group", "Navantia", "Navistar", "Nexter", "Northrop Grumman", "Oshkosh", 
    "Palantir", "Parsons", "Patria", "Peraton", "QinetiQ", "Rafael", "Raytheon Technologies (RTX)", "Rheinmetall", 
    "Rocket Lab", "Rockwell Collins", "Rohde & Schwarz", "Rolls-Royce", "Rosoboronexport", "Rostec", "RUAG", 
    "Saab", "Safran", "SAIC", "Samsung", "Serco", "Sierra Nevada Corp", "Sikorsky", "SpaceX", "Spirit Aerosystems", 
    "ST Engineering", "Sukhoi", "Tales", "Tata Advanced Systems", "Teledyne", "Textron", "Thales", "ThyssenKrupp", 
    "Toshiba", "Toyota", "Ultra Electronics", "United Launch Alliance", "Vectrus", "Verizon", "Vigor Industrial", 
    "VSE Corp", "Wartsila", "Westinghouse", "Williams International", "Wyle", "Xerox", "Yakovlev", "Yamaha", 
    "Zodiac Aerospace"
]

# --- MARKET TAXONOMY (Embedded directly from your provided JSON) ---
# Three-level classification tree used as the reference for system
# classification. Each top-level entry is a dict with:
#   "market_segment"        -> segment name
#   "system_types_general"  -> list of {"name": <general type>,
#                                       "system_types_specific": [<specific types>]}
# TAXONOMY_STR (defined right after the list) is the indent-2 JSON dump of this
# structure; it is substituted into SYSTEM_CLASSIFIER_PROMPT_TEXT via
# `.format(taxonomy_reference=TAXONOMY_STR)`.
# NOTE(review): "Patrol and Costal Combatants" looks like a misspelling of
# "Coastal" — left as-is because it may need to match the upstream JSON; confirm.
TAXONOMY_DATA = [
  {"market_segment": "Air Platforms", "system_types_general": [
      {"name": "Fixed Wing", "system_types_specific": ["Fighter", "Transport Aircraft", "Trainers", "Tanker", "Maritime Aircraft", "C4ISR", "Bomber", "Light Combat Aircraft", "Gunship", "ISR-Strike", "AEW&C", "Target Drone"]},
      {"name": "Rotary Wing", "system_types_specific": ["Attack Helicopter", "Transport Helicopter", "Utility Helicopter", "Maritime Helicopter", "Scout Helicopter", "Rotary Wing Attack", "Rotary Wing Maritime", "Rotary Wing Scout", "Rotary Wing Transport"]},
      {"name": "UAV", "system_types_specific": ["MALE UAV", "HALE UAV", "Tactical UAV", "Mini/Micro UAV", "UCAV", "Loitering Munition"]},
      {"name": "Other Lift Types", "system_types_specific": ["Lighter-than-Air", "Spaceplane", "Hybrid", "Airship", "Parafoil"]}
  ]},
  {"market_segment": "Land Platforms", "system_types_general": [
      {"name": "Armoured Fighting Vehicles", "system_types_specific": ["Main Battle Tank", "Infantry Fighting Vehicle", "Armoured Personnel Carrier", "Armoured Reconnaissance Vehicle", "Mine Protected Vehicle", "Light Tank", "Assault Vehicle"]},
      {"name": "Artillery", "system_types_specific": ["Self-Propelled Artillery", "Towed Artillery", "Multiple Rocket Launcher", "Mortar"]},
      {"name": "Tactical Vehicles", "system_types_specific": ["Light Utility Vehicle", "Truck", "All-Terrain Vehicles"]},
      {"name": "Logistics & Support", "system_types_specific": ["Engineering Vehicle", "Recovery Vehicle", "Vehicle Launch Bridge", "Mine Warfare Vehicles", "NBC Vehicles", "Amphibious Assault Vehicle"]},
      {"name": "Unmanned Ground Vehicles", "system_types_specific": ["Combat UGV", "ISR UGV", "Logistics UGV", "EOD UGV"]}
  ]},
  {"market_segment": "Naval Platforms", "system_types_general": [
      {"name": "Surface Combatants", "system_types_specific": ["Aircraft Carrier", "Destroyer", "Frigate", "Corvette", "Cruiser"]},
      {"name": "Sub-Surface", "system_types_specific": ["Ballistic Missile Submarine", "Attack Submarine", "Cruise Missile Submarine", "Midget Submarine", "Diesel-Powered Submarine", "Nuclear-Powered Submarine"]},
      {"name": "Amphibious", "system_types_specific": ["Amphibious Assault Ship", "Landing Platform Dock", "Landing Ship", "Landing Craft"]},
      {"name": "Patrol and Costal Combatants", "system_types_specific": ["Patrol Vessel", "Fast Attack Craft", "Patrol Boat/Craft - Coastal", "Patrol Boat/Craft - Ocean", "Patrol Boat/Craft - Riverine"]},
      {"name": "Auxiliaries", "system_types_specific": ["Replenishment Ship", "Transport Ship", "Hospital Ship", "Icebreakers", "Research/Survey Vessels"]},
      {"name": "Mine Warfare", "system_types_specific": ["Mine Sweeper", "Mine Hunter", "Mine Counter-Measures"]},
      {"name": "Unmanned Maritime Systems", "system_types_specific": ["USV", "UUV"]}
  ]},
  {"market_segment": "Space Systems", "system_types_general": [
      {"name": "Satellite", "system_types_specific": ["Communication Satellite", "ISR Satellite", "Navigation Satellite"]},
      {"name": "Launch Vehicle", "system_types_specific": ["Heavy Lift Launch Vehicle", "Medium Lift Launch Vehicle", "Small Lift Launch Vehicle"]},
      {"name": "Space Ground Segment", "system_types_specific": ["Ground Station"]}
  ]},
  {"market_segment": "C4ISR Systems", "system_types_general": [
      {"name": "Radar", "system_types_specific": ["Air Defense Radar", "Fire Control Radar", "Surveillance Radar", "Air Search Radar", "Surface Surveillance Radar", "Navigation Radar", "Weather Radar"]},
      {"name": "Communications", "system_types_specific": ["Tactical Radio", "Satcom", "Network Equipment", "Data links", "Satellite Communications"]},
      {"name": "Command and Control", "system_types_specific": ["C2 System", "Battle Management System"]},
      {"name": "Electronic Warfare", "system_types_specific": ["Electronic Attack", "Electronic Support", "Electronic Protection"]},
      {"name": "Electro-optic Sensor", "system_types_specific": ["Imaging EO/IR", "Targeting EO/IR"]},
      {"name": "Sonar", "system_types_specific": ["Airborne", "Naval"]},
      {"name": "Cyber", "system_types_specific": ["Cyber Defense/Offense"]}
  ]},
  {"market_segment": "Weapon Systems", "system_types_general": [
      {"name": "Missile", "system_types_specific": ["Air-to-Air Missile", "Air-to-Surface Missile", "Surface-to-Air Missile", "Surface-to-Surface Missile", "Anti-Tank Guided Missile", "Ballistic Missile", "Cruise Missile", "Anti-Ship", "Anti-Submarine"]},
      {"name": "Munition", "system_types_specific": ["Small Arms Ammunition", "Medium Caliber Ammunition", "Large Caliber Ammunition", "Bomb", "Rocket", "Guided Bomb", "Guided Rocket"]},
      {"name": "Weapon", "system_types_specific": ["Small Arm", "Light Weapon", "Cannon"]},
      {"name": "Torpedo", "system_types_specific": ["Lightweight", "Heavyweight"]},
      {"name": "Directed Energy Weapon", "system_types_specific": ["Laser", "Microwave", "Sonic"]}
  ]},
  {"market_segment": "Training & Simulation", "system_types_general": [
      {"name": "Simulators", "system_types_specific": ["Flight Simulator", "Vehicle Simulator", "Maritime Simulator", "Weapon Simulator"]},
      {"name": "Training Aids", "system_types_specific": ["Training Other"]}
  ]},
  {"market_segment": "Infrastructure", "system_types_general": [
      {"name": "Shipyards/Ports/Harbours", "system_types_specific": ["Construction", "Maintenance/Upgrade"]},
      {"name": "Aircraft Basing", "system_types_specific": ["Construction", "Maintenance/Upgrade"]},
      {"name": "Training Facilities", "system_types_specific": ["Construction", "Maintenance/Upgrade"]}
  ]}
]
# Serialized once at import time so the prompt does not re-dump it per contract.
TAXONOMY_STR = json.dumps(TAXONOMY_DATA, indent=2)

# ==============================================================================
# 2. PROMPTS (UPDATED WITH DOCUMENT RULES)
# ==============================================================================

# System prompt for the first LLM call in process_single_contract(); the reply
# is parsed into SplitterResult. It decides whether one contract text must be
# expanded into several output rows (one per customer country / operator).
# The text is sent to the model verbatim, so edits here change model behaviour.
SPLITTER_PROMPT_TEXT = """
You are a Defense Contract Analyzer. Your GOAL is to identify if the contract requires MULTIPLE database rows.

RULES FOR SPLITTING:
1. **Customer Country Split**: If "Foreign Military Sales (FMS) to Japan, Korea, and Australia", you MUST split into 3 items.
2. **Customer Operator Split**: If "10 for the Navy and 5 for the Air Force", you MUST split into 2 items.
3. **Ukraine Assistance**: If equipment is purchased FOR Ukraine by another country (e.g. USA buys ammo for Ukraine), this is a single row: Customer Country = USA, Operator = "Ukraine (Assistance)". Do NOT split unless multiple donor countries are listed.
4. **NO Split**: If it's a single customer/operator (e.g. "US Navy"), return "requires_split": False.

Return the list of split items with specific Country and Operator for each.
"""

# System prompt for the contract-level extraction call in
# process_single_contract(); the reply is parsed into ContractResult
# (supplier name, program type, value, certainty, currency, G2G/B2G).
# The program-type names listed here correspond to the Literal values accepted
# by ContractResult.program_type (which additionally allows "Unknown").
CONTRACT_EXTRACTOR_PROMPT_TEXT = """
You are a Defense Contract Financial Analyst.

TASK: Extract supplier, program type, value, and funding details.

**PROGRAM TYPE DEFINITIONS** (STRICT):
- **Procurement**: Buying NEW hardware, systems, or production units. (Keywords: production, procurement, delivery, manufacture).
- **RDT&E**: Design, testing, prototyping, BEFORE production. (Keywords: development, prototype, research, design).
- **MRO/Support**: Fixing/Sustaining EXISTING systems. (Keywords: maintenance, repair, overhaul, sustainment, logistics support, depot, modernization).
- **Training**: Purchasing training SERVICES (instruction/coaching). NOTE: Buying simulators is "Procurement".
- **Upgrade**: Adding NEW capabilities to existing platforms.
- **Other Service**: Services not covered above.

**SUPPLIER NAME**:
- Extract the company name exactly as written in the awardee section.

**VALUE**:
- Extract the total ceiling or face value in Millions. 
- Value Certainty: "Confirmed" if definite, "Estimated" for IDIQ/Ceilings.

**G2G/B2G**:
- "G2G" ONLY if "Foreign Military Sales" (FMS) is mentioned. Otherwise "B2G".

Return valid JSON.
"""

# System prompt for the geography call; the reply is parsed into
# GeographyResult. In process_single_contract() this call is only made when the
# contract is NOT split into multiple rows (split rows take country/operator
# from the splitter output instead).
GEOGRAPHY_PROMPT_TEXT = """
You are a Defense Geography Analyst. 
Extract the Customer Country, Customer Operator, and Supplier Country.

**RULES**:
1. **Customer Country**: 
   - The nation PAYING/RECEIVING. 
   - For FMS, it is the foreign nation (e.g. "FMS to Japan" -> Customer: Japan).
   - For Ukraine Assistance (US buys for Ukraine) -> Customer: USA.
2. **Customer Operator**:
   - Select from: Army, Navy, Air Force (includes Space Force), Defence Wide, Ukraine (Assistance), Foreign Assistance, Other.
   - If US buys for Ukraine -> Operator: "Ukraine (Assistance)".
3. **Supplier Country**:
   - The country where the Supplier Company is based.

Return JSON.
"""

# Template for the system-classification call; the reply is parsed into
# SystemResult. It is a str.format template: `{taxonomy_reference}` is filled
# with TAXONOMY_STR by process_single_contract(). Because it goes through
# str.format, any literal brace added to this text must be doubled ({{ }}).
SYSTEM_CLASSIFIER_PROMPT_TEXT = """
You are a Senior Defense System Classification Analyst.
1. **REFERENCE TAXONOMY**: {taxonomy_reference}
2. **TASK**:
   - Classify the system described into **Market Segment**, **System Type (General)**, and **System Name**.
   - **System Name (General)**: The Host Platform or Class (e.g., "F-35 Lightning II").
   - **System Name (Specific)**: The Specific Subject (e.g., "F-35A" or "Logistics Services for F-35").
   - **System Piloting**: Crewed, Uncrewed, Optional, Not Applicable.
"""

# ==============================================================================
# 3. PYDANTIC MODELS
# ==============================================================================

# One customer/operator slice of a contract that has to be split into several
# output rows. Instances are produced by the splitter LLM call as elements of
# SplitterResult.split_items; process_single_contract() turns each one into a
# row, copying customer_country / customer_operator and quoting
# quantity_or_note in the row's "Value Note".
class SplitItem(BaseModel):
    customer_country: str = Field(..., description="The country for this split row (e.g. 'Japan').")
    customer_operator: str = Field(..., description="The operator (e.g. 'Navy', 'Air Force', 'Ukraine (Assistance)').")
    quantity_or_note: str = Field(..., description="Specific quantity or details for this split.")

# Structured reply of the splitter LLM call (SPLITTER_PROMPT_TEXT).
# process_single_contract() splits only when requires_split is True AND
# split_items is non-empty; `reasoning` is not read by the visible pipeline code.
class SplitterResult(BaseModel):
    reasoning: str = Field(..., description="Why a split is or is not needed.")
    requires_split: bool = Field(..., description="True if multiple rows needed.")
    # Defaults to an empty list, so the model may omit it when no split is needed.
    split_items: List[SplitItem] = Field(default_factory=list)

# Structured reply of the geography LLM call (GEOGRAPHY_PROMPT_TEXT).
# All three fields are free-form strings; the allowed operator values are
# stated only in the prompt and are not enforced by this model.
class GeographyResult(BaseModel):
    customer_country: str
    customer_operator: str
    supplier_country: str

# Structured reply of the contract-extraction LLM call
# (CONTRACT_EXTRACTOR_PROMPT_TEXT): supplier, program type and financials.
class ContractResult(BaseModel):
    # Name as extracted; normalised afterwards by clean_supplier_name().
    supplier_name_raw: str = Field(..., description="Raw supplier name.")
    program_type: Literal["Procurement", "RDT&E", "MRO/Support", "Training", "Upgrade", "Other Service", "Unknown"]
    # Contract value in millions (per the prompt); divided equally across rows
    # when the contract is split.
    value_millions: float
    value_certainty: Literal["Confirmed", "Estimated"]
    currency: str
    g2g_b2g: Literal["G2G", "B2G"]
    # Only written to the output when program_type == "MRO/Support"; other
    # program types get "Not Applicable" in the output row.
    mro_duration_months: Optional[int] = Field(None, description="Only for MRO contracts.")

# Structured reply of the system-classification LLM call
# (SYSTEM_CLASSIFIER_PROMPT_TEXT with the taxonomy embedded).
# The taxonomy fields are plain strings: nothing here validates them against
# TAXONOMY_DATA.
class SystemResult(BaseModel):
    market_segment: str
    system_type_general: str
    system_type_specific: str
    system_name_general: str
    system_name_specific: str
    piloting: Literal["Crewed", "Uncrewed", "Optional", "Not Applicable"]
    # Not copied into the output row by process_single_contract(); no range is
    # enforced. NOTE(review): presumably intended as 0..1 — confirm.
    confidence: float

# ==============================================================================
# 4. HELPER FUNCTIONS
# ==============================================================================

def clean_supplier_name(raw_name: str, suppliers: Optional[List[str]] = None) -> str:
    """Map a raw supplier name onto its canonical spelling.

    Matching is done in three steps: exact match, case-insensitive exact
    match, then a case-insensitive fuzzy match (difflib, cutoff 0.6).
    Surrounding whitespace in ``raw_name`` is ignored.

    Args:
        raw_name: Supplier name as extracted from the contract text.
        suppliers: Canonical names to match against. Defaults to the
            module-level ``VERIFIED_SUPPLIERS`` list.

    Returns:
        ``"Unknown"`` for empty / "unknown" / "n/a" input, the canonical
        name when a match is found, otherwise the stripped input unchanged.
    """
    if suppliers is None:
        suppliers = VERIFIED_SUPPLIERS

    name = (raw_name or "").strip()
    if not name or name.lower() in ("unknown", "n/a"):
        return "Unknown"

    # 1. Exact match
    if name in suppliers:
        return name

    # Case-folded lookup table; the first spelling wins if two entries collide.
    canonical_by_folded = {}
    for supplier in suppliers:
        canonical_by_folded.setdefault(supplier.casefold(), supplier)

    # 2. Case-insensitive exact match (e.g. "BOEING" -> "Boeing")
    folded = name.casefold()
    if folded in canonical_by_folded:
        return canonical_by_folded[folded]

    # 3. Fuzzy match on case-folded text: difflib compares characters
    #    case-sensitively, so differing capitalisation would otherwise lower
    #    the similarity score and cause avoidable misses.
    matches = difflib.get_close_matches(folded, list(canonical_by_folded), n=1, cutoff=0.6)
    if matches:
        return canonical_by_folded[matches[0]]

    return name  # Fallback: keep what was extracted

def get_domestic_content(cust_country, supp_country):
    """Classify a purchase as domestic or foreign by comparing two countries.

    Args:
        cust_country: Customer country name (may be None/empty).
        supp_country: Supplier country name (may be None/empty).

    Returns:
        "Indigenous" when both countries are the same (ignoring case and
        surrounding whitespace), "Imported" when they differ, and "Unknown"
        when either side is missing or is an unknown placeholder
        ("unknown" / "n/a").
    """
    customer = (cust_country or "").strip().casefold()
    supplier = (supp_country or "").strip().casefold()

    # A placeholder is not a country: comparing it would label every row with
    # an unknown supplier as "Imported".
    placeholders = {"", "unknown", "n/a"}
    if customer in placeholders or supplier in placeholders:
        return "Unknown"

    return "Indigenous" if customer == supplier else "Imported"

def get_instructor_client():
    """Build the instructor-wrapped OpenAI client for the LLM Foundry endpoint.

    The API key is read from the ``LLMFOUNDRY_TOKEN`` environment variable and
    combined with the project name as ``"<token>:my-test-project"``.

    Returns:
        An instructor client in JSON mode.

    Raises:
        RuntimeError: If ``LLMFOUNDRY_TOKEN`` is unset or blank. Without this
            check the key would be built as ``"None:my-test-project"`` and the
            problem would only surface later as a failure on every API call.
    """
    token = (os.environ.get("LLMFOUNDRY_TOKEN") or "").strip()
    if not token:
        raise RuntimeError(
            "LLMFOUNDRY_TOKEN is not set; cannot create the LLM Foundry client."
        )

    client = OpenAI(
        api_key=f"{token}:my-test-project",
        base_url="https://llmfoundry.straive.com/openai/v1/",
    )
    return instructor.from_openai(client, mode=instructor.Mode.JSON)

# ==============================================================================
# 5. CORE PIPELINE LOGIC
# ==============================================================================

def _extract_structured(client, response_model, system_prompt: str, text: str):
    """Run one structured-extraction LLM call and return the parsed ``response_model``."""
    return client.chat.completions.create(
        model="gpt-4o-mini",
        response_model=response_model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
    )


def process_single_contract(client, text: str, date_val: str, url_val: str):
    """Turn one contract description into one or more output rows.

    Steps:
      1. Ask the splitter whether the contract needs several rows.
      2. Extract contract-level data (supplier, value, program type) and the
         system classification; these are shared by every row.
      3. Build one row per split item, or a single row using the geography
         extraction when no split applies.

    When the contract is split, the total value is divided equally between
    the rows and marked "Estimated". The geography call is skipped in that
    case, so Supplier Country and Domestic Content are both "Unknown".

    Args:
        client: Instructor-wrapped client exposing ``chat.completions.create``.
        text: Contract description.
        date_val: Contract date, copied to the output unchanged.
        url_val: Source link, copied to the output unchanged.

    Returns:
        A list of row dicts (one per output row).
    """
    # --- STEP 1: SPLITTER ---
    split_res = _extract_structured(client, SplitterResult, SPLITTER_PROMPT_TEXT, text)

    # --- STEP 2: COMMON EXTRACTION ---
    contract_res = _extract_structured(client, ContractResult, CONTRACT_EXTRACTOR_PROMPT_TEXT, text)
    final_supplier = clean_supplier_name(contract_res.supplier_name_raw)

    system_res = _extract_structured(
        client,
        SystemResult,
        SYSTEM_CLASSIFIER_PROMPT_TEXT.format(taxonomy_reference=TAXONOMY_STR),
        text,
    )

    # A split only happens when the splitter both asks for it AND lists items.
    # Every later decision keys off this single flag so that a
    # "requires_split=True but no items" reply is treated consistently as a
    # normal, unsplit contract.
    is_split = bool(split_res.requires_split and split_res.split_items)

    if is_split:
        # Rule: if the source doesn't specify the distribution, divide the
        # value equally and mark it 'Estimated'.
        split_count = len(split_res.split_items)
        split_value = round(contract_res.value_millions / split_count, 3)
        items_to_process = [
            {
                "country": item.customer_country,
                "operator": item.customer_operator,
                "value": split_value,
                "certainty": "Estimated",
                "note": f"Split Item: {item.quantity_or_note}. Value divided equally.",
            }
            for item in split_res.split_items
        ]
        # Geography is not extracted for split contracts.
        supplier_country = "Unknown"
    else:
        geo_res = _extract_structured(client, GeographyResult, GEOGRAPHY_PROMPT_TEXT, text)
        items_to_process = [
            {
                "country": geo_res.customer_country,
                "operator": geo_res.customer_operator,
                "value": contract_res.value_millions,
                "certainty": contract_res.value_certainty,
                "note": "Standard Extraction",
            }
        ]
        supplier_country = geo_res.supplier_country

    supplier_known = bool(supplier_country) and supplier_country.strip().casefold() != "unknown"

    # --- STEP 3: ASSEMBLE ROWS ---
    results = []
    for item in items_to_process:
        # An unknown supplier country must not be compared as if it were a
        # real country (that would label the row "Imported").
        if supplier_known:
            domestic_status = get_domestic_content(item["country"], supplier_country)
        else:
            domestic_status = "Unknown"

        results.append({
            "Contract Description": text,
            "Contract Date": date_val,
            "Source Link(s)": url_val,

            # Geography
            "Customer Country": item["country"],
            "Customer Operator": item["operator"],
            "Supplier Country": supplier_country,
            "Domestic Content": domestic_status,

            # Contract
            "Supplier Name": final_supplier,
            "Program Type": contract_res.program_type,
            "Value (Million)": item["value"],
            "Value Certainty": item["certainty"],
            "Currency": contract_res.currency,
            "G2G/B2G": contract_res.g2g_b2g,
            "Value Note": item["note"],
            "Expected MRO Duration": contract_res.mro_duration_months if contract_res.program_type == "MRO/Support" else "Not Applicable",

            # System (Taxonomy)
            "Market Segment": system_res.market_segment,
            "System Type (General)": system_res.system_type_general,
            "System Type (Specific)": system_res.system_type_specific,
            "System Name (General)": system_res.system_name_general,
            "System Name (Specific)": system_res.system_name_specific,
            "System Piloting": system_res.piloting,
        })

    return results

# ==============================================================================
# 6. EXECUTION
# ==============================================================================

def run_pipeline(input_path, output_path):
    """Process every contract in an Excel sheet and write the rows to CSV.

    The input sheet is read with ``pd.read_excel`` and is expected to provide
    the columns "Contract Description", "Contract Date" and "Source URL"
    (missing columns/cells are treated as empty). Rows whose description is
    shorter than 10 characters are skipped. A failure on one contract is
    reported and does not stop the run.

    Args:
        input_path: Path of the Excel file to read.
        output_path: Path of the CSV file to write.
    """
    print(f"Reading {input_path}...")
    df = pd.read_excel(input_path)
    client = get_instructor_client()

    def cell_text(row, column):
        # Empty Excel cells arrive as NaN; str(NaN) would leak the literal
        # text "nan" into the output columns.
        value = row.get(column, "")
        return "" if pd.isna(value) else str(value)

    all_data = []
    total = len(df)

    # Number rows by position so the progress and error lines agree with each
    # other and stay meaningful even if the index is not 0..n-1.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        print(f"Processing Row {position}/{total}...")
        desc = cell_text(row, "Contract Description")
        date = cell_text(row, "Contract Date")
        url = cell_text(row, "Source URL")

        if len(desc) < 10:
            continue

        try:
            rows = process_single_contract(client, desc, date, url)
        except Exception as e:
            # Best-effort batch: report the failing row and carry on.
            print(f"Error row {position}: {e}")
        else:
            all_data.extend(rows)

    final_df = pd.DataFrame(all_data)
    final_df.to_csv(output_path, index=False)
    print(f"Done! Saved to {output_path}")

if __name__ == "__main__":
    # Create a one-row sample workbook if none exists, so the pipeline can be
    # exercised end to end without real input data.
    if not os.path.exists("sample_data.xlsx"):
        print("Creating dummy file for testing...")
        pd.DataFrame({
            "Contract Description": [
                # The description is one logical string; it is written as two
                # adjacent literals (implicitly concatenated) because a raw
                # line break inside a quoted string is a syntax error.
                "Raytheon Missile and Defense, Tucson, Arizona, is awarded a $328,156,454 fixed-price incentive (firm target) contract. This contract provides for the production and delivery of Lot 21 as follows: 483 AIM-9X Block II all up round tactical missiles (212 for the Navy, 187 for the Air Force and 84 for Foreign Military Sales (FMS) customers); 82 AIM-9X block II plus all up round missiles (eight for the Navy, eight for the Air Force and 66 for FMS customers); 156 Block II Captive Air Training Missiles (82 for the Air Force and 74 for FMS customers); eight Block II Special Air Training Missiles (two for the Air Force and six for FMS customers); 198 all up round containers (75 for the Navy, 73 for the Air Force and 50 for FMS customers); six spare advanced optical target detectors (two for the Air Force and four for FMS customers); five spare advanced optical target detector containers (two for the Air Force and three for FMS customers); 29 spare Block II guidance units (live battery) (13 for the Navy, four for the Air Force, and 12 for FMS customers); six spare Block II plus guidance units (live battery) for FMS customers; 41 guidance unit containers for FMS customers; 72 spare Captive Air Training Missile guidance units (inert battery) (22 for the Navy, three for the Air Force, and 47 for FMS customers); two spare Block I propulsion steering sections for the Air Force; seven spare Block II propulsion steering sections (two for the Navy, four for the Air Force, and one for FMS customers); 72 spare Block II electronics units (68 for the Navy and four for the Air Force); two classroom explosive ordnance disposal systems trainers for FMS customers; one practical explosive ordnance disposal systems trainer for a FMS customer; 11 multi-purpose training missiles for various FMS customers; 135 tail caps (eight for the Navy, 16 for the Air Force and 111 for FMS customers); 35 tail cap containers (two for the Navy, four for the Air Force, and 29 for FMS "
                "customers; one lot of spares assets for the Navy; one lot of spares assets for the Air Force; and one lot of spares assets for the governments of Australia, Bahrain, Belgium, Bulgaria, Canada, Denmark, Finland, Indonesia, Israel, Japan, Kuwait, Malaysia, Morocco, the Netherlands, Norway, Oman, Poland, Qatar, Romania, Saudi Arabia, Singapore, Slovakia, Republic of  Korea, Taiwan, Turkey, and the United Arab Emirates.  Work will be performed in Tucson, Arizona (31%); North Logan, Utah (10%); Keyser, West Virginia (9%); Niles, Illinois (8%); Vancouver, Washington (5%); Ottawa, Ontario, Canada (5%); Goleta, California (4%); Cheshire, Connecticut (4%); Heilbronn, Germany (3%); Simsbury, Connecticut (2%); San Jose, California (2%); Valencia, California (2%); Anaheim, California (2%); Cajon, California (2%); Cincinnati, Ohio (1%); Anniston, Alabama (1%); San Diego, California (1%); Chatsworth, California (1%); Amesbury, Massachusetts (1%); Claremont, California (1%); Sumner, Washington (1%); and various locations within the continental U.S. (4%), and is expected to be completed in June 2024.  Fiscal 2021 weapons procurement (Navy) funds in the amount of $98,204,232; fiscal 2021 missile procurement (Air Force) funds in the amount of $102,681,830; fiscal 2021 research, development, test and evaluation (Air Force) funds in the amount of $802,382; fiscal 2020 missile procurement (Air Force) funds in the amount of $257,638; fiscal 2020 weapons procurement (Navy) funds in the amount of $108,826; fiscal 2019 missile procurement (Air Force) in the amount of $295,576; and FMS funds in the amount of $125,805,970 will be obligated at the time of award, $295,576 of which will expire at the end of the current fiscal year. This contract was not competitively procured pursuant to Federal Acquisition Regulation 6.302-1. The Naval Air Systems Command, Patuxent River, Maryland, is the contracting activity (N0001921C0723).",
            ],
            # ISO 8601 (YYYY-MM-DD); the previous value "2021-30-06" had day
            # and month swapped and was not a valid date.
            "Contract Date": ["2021-06-30"],
            "Source URL": ["https://www.defense.gov/News/Contracts/Contract/Article/2678227/"]
        }).to_excel("sample_data.xlsx", index=False)

    run_pipeline("sample_data.xlsx", "Verified_Output.csv")

  from .autonotebook import tqdm as notebook_tqdm


Creating dummy file for testing...
Reading sample_data.xlsx...
Processing Row 1/1...
Done! Saved to Verified_Output.csv


In [2]:
import os
import re
import json
import pickle
import pandas as pd
import numpy as np
import faiss
import instructor
import difflib
import datetime
from typing import List, Literal, Optional
from collections import Counter
from dateutil import parser
from pydantic import BaseModel, Field
from openai import OpenAI
from sentence_transformers import SentenceTransformer

# ==============================================================================
# 1. CONFIGURATION & DATA ASSETS
# ==============================================================================

# Ask for the LLM Foundry API key only when the environment does not already
# provide a non-blank one. The prompt goes through getpass rather than input()
# so the typed secret is not echoed back into the notebook's cell output.
if not os.environ.get("LLMFOUNDRY_TOKEN", "").strip():
    import getpass  # local import: only needed on the interactive path

    os.environ["LLMFOUNDRY_TOKEN"] = getpass.getpass("Enter your LLM Foundry API Key: ").strip()

# --- VERIFIED SUPPLIER LIST ---
# Flat list of verified supplier names. Entry text and entry order are kept
# exactly as authored; only the physical line layout was changed so that no
# string literal is split across lines.
# NOTE(review): the list contains exact duplicates ("RTX",
# "Ball Aerospace & Technologies", "Management Services") and near-duplicates
# ("Nan Inc" / "Nan Inc.", "Integrated Dynamic" / "Integrated Dynamics").
# They are left in place because removing them would shift list positions —
# confirm nothing depends on positions before de-duplicating.
# NOTE(review): "West Coast JV," carries a comma inside the literal — looks
# like a typo, but left untouched pending confirmation.
VERIFIED_SUPPLIERS = [
    "22nd Century Tech", "A&P Group", "A&R Pacific -Garney Federal", "AAR Supply Chain Inc.", "Aardvark Clear", "AASKI Technology", "AAVCO", "Abacus Tech Corp", "Abdallah Al-Faris", "Abeking Rasmuss",
    "ABG Shipyards", "ABM Shipyard", "Absher Construction Co.", "Abu Dhabi MAR", "Abu Dhabi SB", "ACC Construction Co.", "Accenture", "Accurate Energetic Systems", "ACE Technology", "Aceinfo Solutions",
    "ACHILE Consortium", "Achleitner", "ACMI", "ACT-Corp", "ActioNet", "ADCOM Systems", "ADI Group", "Admiralty Ship", "Advanced Navigation and Positioning Corp.", "Advanced Technology International",
    "AdvElect Co (AEC)", "AECOM", "Aegis Technologies", "Aeraccess", "Aero Def Systems", "Aero Synergie", "Aero Vodochody", "Aerodata AG", "Aerodyca", "Aerojet Rocketdyne", "Aeromaritime Grp", "Aeromot",
    "Aeronautical Development Establishment", "Aeronautics Defense Systems", "Aerospace Corp", "Aerostar", "Aerostar S.A.", "Aerotree", "AeroVironment", "AeroVolga", "Affigent",
    "Africa Automotive Distribution Service", "Agat", "AgEagle", "Agiliti Health", "AICI-Archirodon JV", "AIDC", "AIM Defence", "Air Center Helicopters", "Air Tractor", "Airbus", "Airbus-Rheinmetall",
    "Airborne Tactical Advantage Co.", "Aircell", "Aircraft Readiness Alliance", "AirRobot", "AIS Engineering", "Akkodis", "Albadeey", "Albatross Industria Aeronautica Ltd.", "Alcatel-Lucent", "Alcock Ashdown",
    "Alexandria Ship", "Alion Science", "Allen-Vanguard", "Alliant Techsystems Operations", "Allison Transmission", "Alpha Marine", "Alpine Armoring Inc.", "Altawest", "Altec Industries", "ALTECH Services",
    "Aleut Federal", "Alzchem Trostberg", "AM General", "Amazon", "Amentum Services", "American International Contractors", "American States Utilities Services", "American SysCorp", "American Systems Corp",
    "AMESYS", "AMI Industries", "AMO ZIL", "Amper Group", "AMSL Aero", "AMTEC Corp.", "AMX International", "Amyx Inc", "AMZ-Kutno", "Anadolu Shipyard", "Analytic Services", "Ananda Shipyard", "Andrea Systems",
    "Andritz Hydro Corp.", "Anduril Industries", "Antonov", "ANVL", "AOI", "Apogee Engeineering", "Applied Mechanics", "Applied Technology", "Applied Visual Technology", "APS", "Aquacopters", "Aquila Aerospace",
    "Arab Contractors", "Arcfield Canada", "Archer Aviation", "Archer Western", "Arcturus UAV", "ARES Shipyard", "Aresa Shipyard", "ARGE K-130", "ARGE NNbS Consortium", "Argon ST Inc.", "ARGE DiNa 155", "ARIS",
    "Arma", "Armenian Air Force Institute", "American Electronics Warfare Associates", "American Ordnance", "Armour International", "Armoured Car Sys", "ARMSCOR", "Armtec Defense Products",
    "Arnold Defense and Elec", "ARO SA", "Arotech Corp", "Arquus", "Array Information", "Arrow Edge LLC", "AR-SAT", "Arsenal d' Marinha", "Arsenal JSCO", "ARTEC", "ARTEL, Inc", "ASC Pty Ltd",
    "Ascent Flight Training Consotium", "Ascom Group", "Aselsan", "ASENAV", "Asian ArmoredVeh", "ASIMAR", "ASISGUARD", "Ashot Ashkelon", "ASL Group", "Aslemetals Oy", "ASMAR", "ASRC Federal", "ASRY",
    "Assurance Tech", "Assured Information Security Inc.", "Aster Engineering", "Astilleros Armon Vigo SA", "Astilleros Navales", "ASTIMAR", "Astronics Test Systems", "ASTRUM", "AT&T", "A-techSYN",
    "Atheeb Integraph Saudi Co.", "Atlas", "Atlas Elektronik", "Atlas Group", "ATR", "August Schell Enterprises", "Aurora Flight Sciences", "Austal Limited", "Australian Target Systems", "Autoespar SA",
    "Automotive Ind Ltd", "AUVERLAND", "Aviation Repair Technologies", "Aviation Systems Engineering", "Aviation Training Consulting", "Avibras", "AVIC", "AVNL", "Avtech Corporation", "Avtokraz Holding Co",
    "AWEIL", "AWSR Shipping", "B&F", "Babcock Group", "BAE Systems", "Ball Corporation", "Baltic Workboats", "BAMS", "Bangkok Dock", "Barrett Comm", "Basler", "Bason Shipyard", "Bath Iron Works", "Battelle",
    "Baud Telecom Co", "Baykar", "Bechtel Group", "Becker Avionics", "Beechcraft", "Beherman Demoen", "Beijing JeepCorp", "Bell", "Bell Boeing", "Bell Textron", "Bellanca", "BEML-India", "Bender Shipbldg",
    "Bergen Group", "Beriev", "BGI-ASI JV", "Bharat Dynamics", "Bharat Elec Ltd", "Bharat Heavy Electricals", "Bharat Sanchar", "Bharati Shipyard", "Bigelow Family Holdings", "Bin Jabr Group", "Bird Aerosystems",
    "Birdon", "Bittium", "BL Halbert International", "Black Box Corp", "Black Micro Corp", "Black River Systems", "Blackberry", "Blackned", "BlackSky", "BlindermanPower", "Blue Air Training", "Blue Ivy Partners",
    "Blue River Consortium", "Blue Tech Inc", "Bluebird", "BlueHalo", "Boeing", "Boelwerf Shipyard", "Bollinger Shipyard", "Bombardier", "Booz Allen Hamilton", "Boresight", "Boustead DCNS JV",
    "Boustead Holding", "Bowhead", "Brahmos Ltd", "BrainGu LLC", "Bridgestone Aircraft Tire Inc.", "Britten-Norman", "Brodosplit Shipyard", "Brooke Marine", "Bryan 77 Construction", "BSVT", "BSVT-NT",
    "BT Group", "BUAA", "BwFuhrparkService", "BWI", "By Light", "CACI", "CAE USA", "Cairns Slipways", "Calian", "Calidus", "Cambridge Intl Systems", "Cammell Laird SB", "Canadair", "Cantieri Navali",
    "Carahsoft Inc.", "Cardama", "Carnegie Mellon University", "CASC", "CASIC", "C-Astral Aerospace", "C-AT", "Caterpillar", "CATIC", "CDO Technologies", "CDW Corporation", "CEA Tech Pty Ltd", "Celier Aviation",
    "CENTECH GROUP", "Cerbair", "Cessna", "CFM International", "CGI", "Chaiseri Metal & Rubber", "Changhe", "Chantier Davie Ship", "Charles Stark Draper", "Chas Kurz", "Chemring", "Chengdu",
    "Chowgule and Company", "Chrysler Group", "Chugach Technical Solutions", "Chung Shan Inst", "Cianbro", "CINAR", "CIO", "Cirrus Aircraft", "Cisco Systems", "Clark Construction Group",
    "CM de N (France)", "CNF Technologies", "CNIM", "CNN Navigation", "Coastal Defense Inc.", "Cobham", "Cochin Shipyard", "CODALTEC", "Codan", "Cohort plc", "Cohu Inc", "Colby Co. LLC",
    "Cole Engineering Services", "Collins Aerospace", "COLSA", "Colt's Manufacturing Co.", "Columbia Helicopters Inc.", "COM DEV International", "Comlenia", "Commander Aircraft Corporation",
    "Commtact", "Computacentre", "CompQsoft", "Computer World Services", "COMSOFT", "ComtechTelecomm", "Conco Inc.", "Conlog Group", "Conoship Intl", "Conquest USA", "Consigli Construction",
    "Consortium Management Group", "Conti Federal Services", "Continental Maritime", "Core Tech International", "Core4ce", "Corporacion De La Industria Aeronautica Colombiana", "Corvid Technologies",
    "COTECMAR", "CounterTrade", "CoVant Technologies", "CPMIEC", "Credence Mgmnt Sol", "Creotech", "Crew Training International", "CRIST", "Criterion Solutions", "CRL Technologies", "Crowley Maritime",
    "CRSA", "Crystal", "CSBC Corp., Taiwan", "CSC", "Cubic Corporation", "Cukurova Holding", "Cummins Inc.", "Curt Nyberg", "Curtiss-Wright", "Cybaero AB", "Dae Sun Shipbldg", "Daewoo", "Daher", "Daimler AG",
    "Dakota Creek", "Dalnyaya Radio", "Damen Shipyards", "DAMEX Shipbldg", "Danbury Mission Technologies", "Danish Maritime", "Danish Yacht", "Danyard Aalborg", "Darkhive", "DARPA", "Dassault", "Dassault Dornier",
    "Data Link Solutions", "Data Sys Analysts", "Datamir", "DataPath", "Day & Zimmerman Lone Star", "Dayton T. Brown Inc.", "DCCA", "DCD-DORBYL", "DCI", "DCNS Odebrecht", "DCS Corp.", "De Havilland Canada",
    "Dearsan Shipyard", "Decisive Analytics", "Deep Trekker", "Defense Ind Org", "Defense Industries Organization Of Iran", "Defense Solutions", "Defense Technology Institute", "Defenture", "Deftools",
    "Delaware Nation Industries Emerging Technologies", "Dell Inc", "Deloitte", "Denel", "Derecktor Shipyard", "DESA", "Design West Technologies", "Destini Berhad", "Detyens Shipyard", "DEW Ltd", "DFDS Group",
    "Diamond Aircraft", "DIANCA", "DIDEP", "Diehl", "Digital Angel Corp", "Digital Management", "Diligent Consulting", "Divelink Underwater", "Diversified Tech Svcs", "DJI", "DKW Communications", "DOF ASA",
    "Domo Tactical Communications", "Doosan Group", "Dornier", "Draken", "Draper Labs", "DRB-HICOM", "DRDC Canada", "DRDO", "Drew Marine USA", "DRS Network and Imaging Systems", "DSD Laboratories", "DSG",
    "DSN Corp", "DSTA", "Ducommun Inc.", "DXC Technology", "Dynamic Systems", "Dynamics Resrch", "Dynamit Nobel", "DynCorp Int'l", "Dynetics Technical Solutions", "Eastern Shipbuilding", "ECAN", "ECRN",
    "ECS Federal", "Emcube Inc", "Edgar Brothers", "Edge Autonomy", "EDGE Group", "Edison Chouest", "EFR Ltd", "eGlobalTech", "EID S.A.", "Eire Forge and Steel", "EINSA", "ELAC Sonar GmbH", "Elbit Systems",
    "ELBO", "Elebra", "Electra", "Electric Boat Corp.", "Electro Optic Systems", "Elettronica SpA", "ELINC", "EllisDon", "EM Solutions", "Embraer", "EMESEC", "EMGEPRON", "Emit Aviation", "EMPL Austria", "EMPORDEF",
    "EMS Tech", "EMT", "EMW", "ENAER", "EnerSys Energy Products", "Engility Corporation", "Engine Eng Oman", "ENICS", "Ensign Bickford", "Enstrom Helicopter", "Entrol", "Environics Oy",
    "Environmental Chemical Corp", "Envision Technology", "Envisioneering Inc.", "EONIC", "EOS", "EPC2 Consortium", "EPE", "EPIIC Consortium", "Eprius", "EPS Corporation", "Epsilon Systems", "ESSI/SEI",
    "Esterline", "Eurofighter", "EuroMIDS", "EUROPAAMS", "Eurosam", "Euroshop SA", "EuroSpike", "Exail", "Excellus Solutions", "Exeter Group", "Extra", "FABREQUIPA", "Fabryka Broni", "FAdeA",
    "Fairchild", "Famae", "FAME SAC", "Fasharkan Ship", "Fassmer", "FAW Group", "FCN Technology Sol", "FEDITC", "Federal Contracting", "FedStore Corp", "FemmeCompInc", "Fenix Air Inc.", "FFA", "FFA Emmen",
    "Firestorm Labs", "Five Rivers Analytics", "Fiat Group", "Fiat-Leonardo", "Flatter Inc.", "Fidelity Technologies", "Fincantieri", "Fischer Panda", "Flensburger Fahrzeugbau", "Flensburger SB",
    "Flight Technologies", "Flightcell Intl", "FLightSafety", "FLIR", "Fluor Marine Propulsion", "Flyer", "Flying Legend", "FMA", "FN Herstal", "FNSS", "Fokker", "Force 3", "Ford Motor Co.",
    "Forum Energy Technologies", "FREIRE Shipyard", "Frequentis GmbH", "Fresia SPA", "Frontgrade Technologies", "Frontier Electronic Systems", "FSC Lublin Auto", "FSUE Neptune", "Fujitsu",
    "Furuno Electric", "G & F Technology", "G1 Aviation", "Gabler Maschine", "GAF", "Game Composites", "Garco Construction Inc.", "Garden Reach SB", "Garmin", "Gate Elektronik", "GC Rieber Shipping",
    "GDELS-Mowag", "GECI", "Gemelli", "Genasys", "GenCorp", "General Atomics Aeronautical Systems", "General Dynamics", "General Electric", "General Motors", "Generic Supplier", "Geneset Powerplants",
    "Georgia Tech", "German Naval Yards", "GESPI", "GFE", "Gibbs & Cox Inc.", "GIDS", "Gilbane Federal", "GKN Aerospace", "Gladding-Hearn", "Global GndSpt LLC", "Global Military Products", "Global Services LLC",
    "Global Tech Res", "Global Technical Sys", "Globecomm", "GMV Aerospace and Defense", "Goa Shipyard Ltd", "Golcuk", "Goodrich Corp.", "Goodyear Tire and Rubber Co.", "Grabba", "Granite-Obayashi",
    "Granta Autonomy", "Grevicom SAC", "Griffon Corporation", "Grob", "GRYFIA", "GTRI", "Guimbal", "Guizhou", "Gulf Island Marine Fabricators LLC", "Gulfstream", "Guyco Inc.", "GZAS", "H2O Guam JV", "Hadean",
    "Hai Minh Corporation", "Haivision Systems Inc.", "HAL", "Hanjin Indust'l SB", "Hanwha", "Harbin", "Harland & Wolff", "Harper Construction", "Harris", "Harwar International Aviation Technology", "Hatehof",
    "Hawaiian Rock Products Corp.", "HB Utveckling AB", "HAVELSAN", "HDT Expeditionary Systems", "Head/Diaz 2022", "Heavy Ind. Taxila", "Heckler & Koch", "Helibras", "Helicentro Peru SAC", "Hellenic Aerospace Industries",
    "Hellfire LLC", "Hensel Phelps Construction", "Hensoldt", "HESA", "HexagonComposite", "Hinduja Group", "Hindustan Ship", "Hi-Q Engineering", "Hisdesat SA", "Hitachi", "Hitachi Kokusai", "Hitzler Werft", "HKV", "Hodges Transportation",
    "Honeywell", "Hong Ha Shipbuilding", "Hong Leong Group", "Hongdu", "Horizon Technologies", "Hornbeck Offshore Operators", "Howe and Howe", "HP", "HPI Solutions", "HTX Labs", "Hughes Comm", "Humbert Aviation", "Huneed Tech", "Huntington Ingalls",
    "Huta Stalowa Wola", "HV Joint Venture", "Hydra Technologies", "Hydrema", "Hyundai", "Hyundai J Comm", "IAI", "IAP Worldwide Svc", "IAR", "IBM", "ICF", "Icom Inc.", "ICOMM Tele Ltd.", "IdeaForge", "iGOV", "IHI", "II-VI Aerospace and Defense",
    "Ilyushin", "ImagineOneT&M", "IMBEL", "IMC Group", "IMCO", "immixGroup", "IMMSI SPA", "IMPSA", "Imtech Marine", "INACE", "Indonesian Aerospace", "Indra", "Indrasoft", "INDUS Technology", "InDyne", "InfoReliance Corp", "Infotron", "Inmarsat", "Innocon",
    "Innnovaero", "Insitu", "Insta ILS", "Institute of International Education", "INTA", "Integ Surv Tech", "Integral Consulting Services", "Integral Systems", "Integrated Convoy", "Integrated Defense Solutions/Greit", "Integrated Dynamic", "Integrated Dynamics",
    "Integrated Surveillance and Defense", "Integration Innovation", "Intelligent Decisions", "Intelligent Waves", "INTELSAT", "Inter-Coastal Electronics", "InterCaribbean Airways", "Intermarine", "International Business Machines Corp.", "Intl Shipholding Corp",
    "Intman SA", "Intracom SA", "INVAP", "Invariant Corp.", "INVISIO", "IOMAX", "IPS Inc", "Iridium Satellite", "Iron Bow Tech", "Irving Shipbldg", "Israel Military Industries", "Israel Shipyards", "ISRO Internal", "Istanbul Shipyard", "Isuzu Motor Co", "Italcantieri",
    "Italtel", "Italthai Marine", "ITG", "ITI Limited", "ITP Aero", "ITT", "Iveco Defence Vehicles", "Iveco-Oto Melara Consortium", "IVEMA", "IWI", "IXBlue", "Izhmash Unmanned Systems", "Jacobs Eng Group", "Jacobs/B&V JV", "James Fisher", "Jankel", "Japan Marine United",
    "Japan Steel Works", "Javelin JV Team", "JCB", "Jelcz-Komponenty", "Jet Tekno", "JetZero", "JF Taylor", "JHU/APL", "Joby Aviation", "Johns Hopkins University", "Jong Shyn Ship", "JRC Group", "JSC Almaz-Antey", "JSC Kurganmashzavod", "JSC Tactical Missiles Corp",
    "Junghans Microtec", "Jupiter Wagons Ltd.", "KADDB", "Kader", "KAI", "Kaman", "KAMAZ", "Kamov", "Kangnam Corp", "Karachi Shipyard (KSEW)", "Katmai Management Services", "Katmerciler", "KATO Engineering", "Kawasaki", "Kay and Associates", "Kazakhstan Eng",
    "RTX", "Nan Inc", "R&M Government Services", "Kazan", "KBM Kolumna", "KBP Instrument", "KBR", "Kearfott Corp", "Keppel Corp", "Kerametal", "Kership", "Khan ResLabs", "Kharkiv Morozov", "Khulna Shipyard",
    "Kiewit-Alberici SIOP MACC", "King ICT", "King Technologies", "KIRINTEC", "KNDS", "Knight Sky", "Knights Armament Co.", "Koam Engineering", "Koc Group", "KomatsuIndustries", "Kongsberg", "KONSTRUKTA",
    "Kord Technologies", "Korea Shipbuilding & Offshore Engineering", "Korte Construction", "Agency for Defense Development", "Korean Air Aerospace Division", "KRAS - India", "Krasmashzavod", "Kratos Defense",
    "Kronshtadt Group", "Krauss-Maffei Wegmann", "Kryukov Car Bldg", "KT Consulting", "KVH Industries", "Kyndryl Finland", "L3 Technologies", "Lancair", "Landmarc", "Lane Construction Corp.", "Larsen & Toubro",
    "Leidos", "Leonardo", "LET", "Level 3 Comm", "Life Cycle Engineering", "LG Group", "LIG Nex1 Co", "LinQuest Corp", "LinTech Pragmatics JV", "Lite Comms LLC", "Lockheed Martin", "Loc Performance Products",
    "LOM PRAHA", "Long Wave Inc", "Longbow LLC", "Loral", "Lumen", "Lumenier", "Lung Teh Shipbldg", "Lurssen Group", "Lutch", "Lutsk", "M Ship Co", "M1 Support Services", "M2 Technologies", "M7 Aerospace",
    "Mach Industry Grp", "Mack Defense", "Mackay Comm", "MAESTRAL", "Maestranza AMSU", "MAG Aerospace", "Magellan Aerospace Corporation", "Mahindra", "MA Mortenson", "MAN", "Manhattan Construction", "ManTech",
    "Mapiex Aviation", "Marine Alutech Oy", "Marine Hydraulics", "Marine United", "MarineTec", "Marinette Marine Corp.", "MARS Shipyards", "Marsh Aviation", "Marshall Aerospace", "MARSS", "Marsun Company",
    "Martifer Group", "Marvin Land System", "Mastodon Design", "Mathtech", "Maule Air", "MAV", "Maxar Technologies", "Mazagon Dock", "MBB", "MBDA", "McCrone Associates", "McDermott Marine", "McLean Contracting",
    "MD Helicopters", "MDA Space", "MDT Armour", "MechDB S Africa", "Mectron", "MEDAV GmbH", "Mercedes-Benz", "Mercer Engineering Research Centre", "Mercury Systems Inc.",
    "Merlin Labs Inc.", "Merwede", "Mesko", "MESIT holding", "Messer Construction", "Metal Shark", "MetalCraft Marine", "Metalnor SA", "Meyer Werft", "Michelin", "Micro Aviation", "Microdis Electronics",
    "Micropol Fiberoptic AB", "Microsoft", "MicroTech", "Middle East Def", "Mikal Group", "Mikoyan", "Mil", "MilDef", "Milenium Veladi Corp.", "Millenium Space", "MilSOFT Software", "MineWolf Systems",
    "MISC Berhad", "Mission1st", "Mistral Inc.", "Mitie", "Mitsubishi", "Mitsui SB", "MKEK", "MMIST", "MNDI Pacific JV", "MO Porte-Avions", "Modern Technology Solutions", "Moller-Maersk", "Moog Inc.",
    "MorseCorp Inc.", "Morye Shipyard", "Motorola Solutions", "MSI", "Mudry", "Mugin", "MVL USA", "MW Builders", "Mythics", "Nakilat", "Nakupuna Consulting", "NAMC", "Nammo", "Nan Inc.", "Nanchang",
    "National Academy of Sciences of Belarus", "National Steel and Shipbuilding", "Natl Radio Telecom", "Nautica Nova", "Naval Gijon Ship", "Naval Group", "Naval Shipyard Gdynia", "Navantia", "Naviris",
    "Navistar International", "Navmar", "NCI Info Sys", "NCSIST", "ND Defense", "NDMA", "NEC", "Neiva", "Neorion Group", "NES Associates", "NetCentrics Corp", "Netline Comm", "New Directions Technologies",
    "NewSpace India", "NEWTEC", "Nexter", "NGV Tech", "NH Industries", "NICCO Comm", "Nigerian Dockyard", "NII STT", "Niigata Shipbuilding", "NIMR Auto", "Nissan", "nLIGHT Nutronics",
    "Noble Supply and Logistics", "Noblis MSD", "Nokia", "Nordic Terrain Solutions", "Norinco", "Norma Precision AB", "Nortel", "North Sea Boats", "Northrop Grumman", "Northstar Aviation", "Nostromo",
    "Novadem", "NP Aerospace", "NPO Elektro'ka", "NPO Lavochkin", "NRL", "NSSL", "NSWC", "NT Service", "NTConcepts", "nTSI", "NTT Group", "NUBURU", "Nurol Co.", "NVL Group", "Oakwell Engineer", "OBRUM",
    "OCEA Group", "Ocean Shipholdings", "Ocean Tech Sys", "Oceaneering", "OCR Global", "Odebrecht Group", "Odyssey Systems Consulting Group", "OGMA", "OHB System AG", "OIP Land Systems",
    "Old North Utility Services",
    "Olin Winchester", "Omnisec AG", "Omnisys", "Ondas", "Optics1 Inc.", "Optima Government Solutions", "Orbit Technologies", "Orizzonte Sistemi Navali", "Orskov Group", "Oshkosh", "OSI", "Otobus Karoseri",
    "Otokar", "OTT Technologies", "Out of Business", "Overaasen AS", "Ovzon", "PAC", "Paccar", "Pacific Aerospace", "Pacific Rim Constructors", "Pacifics Propeller International",
    "PAE Aviation and Technical Services", "Pakistan Aeronautical Complex", "Palantir Technologies", "Palantir USG", "Palfinger", "PAMA-SP", "PanAmSat", "Panavia", "Panha", "Paramount Group",
    "Parker-Hannifin", "Parrot", "Parsons Government Services", "Patria", "Patriot Contract Svcs", "PCCI", "PCM", "Pearson Engineering", "Peerless Technologies", "Pelatron", "Pelegrin", "Penman Company",
    "Peraton Technology Services", "Persistent Systems", "Peterson Bldrs", "PGSUS", "PGZ", "PGZ-PILICA Consortium", "PGZ-NAREW Consortium", "Phacil Inc", "Phoenix Air Group", "Philadelphia Yard",
    "Philippine Telephone", "Piaggio", "Pilatus", "Pindad", "Piper", "Pipistrel", "Piriou Naval Svcs", "PJ Aviation", "PKL Services", "Plath", "PN Dockyard", "Polaris Industries", "Polish Defence Holding",
    "Polska Grupa", "Polskie Zaklady Lotnicze", "Poly Technologies", "Polyot", "Polysentry", "Pragmatics", "Presidio", "Priboy", "PRIMA Research", "Proforce Defence",
    "Programs Management Analytics and Technologies", "Propmech Corp", "Prox Dynamics", "PS Engineering", "PSI", "PSM", "PT Batam", "Pt Bhinneka Dwi Persada", "PT Citra Barahi Shipyard",
    "PT Daya Radar Utama", "PT Dirgantara", "PT Dumas Shipyard", "PT Kodja Bahari", "PT PAL Indonesia", "PT Palindo", "PT Republik Defensindo", "PZL-Mielec", "PZL-Okęcie", "PZL-Swidnik", "Q-Techn LLC",
    "Qbase, LLC.", "QED Systems Inc", "QinetiQ", "Qioptiq", "Qods Aviation Industries", "Quad City Aircraft", "QualX Corp.", "Quantum Research", "Quantum Systems", "QuantX Labs", "Qwest", "R&W Contractors",
    "Radiance Tech", "Radmor SA", "Rafael", "RAM Systems", "RAMET", "Range Generation Next",
    "Rannoch Corp", "Rauma Marine", "Ravenswood Solutions Inc.", "RAVN Group", "Raytheon Technologies", "RC Construction", "Rebellion Defense", "ReconCraft LLC", "Record Steel & Construction",
    "Red Peak Technical Services", "Red River Computer", "Redflex Group", "Redwire", "Regional One", "Reims-Cessna", "Reiser", "Reliance Defence", "Reliance Test and Technology", "Remdiesel",
    "Remontowa Group", "Remoy Shipping", "Renk America", "Repkon USA-Defense", "Reshetnev Company", "Ressenig", "Reunert", "Revolution Aviation", "Rh-Alan", "RHEA Group", "Rheinmetall",
    "Rheinmetall BAE Systems Land", "Rheinmetall Denel", "Rheinmetall MAN", "Ribcraft USA", "Ricardo PLC", "Riga Shipyard", "RIO Design Bureau", "Rio Santiago Shipyard", "Rise8 Inc.", "RiverHawk Group",
    "Robertson Fuel Systems", "Robin Radar Systems", "Robinson", "Roboteam", "Rocket Lab National Security LLC", "Rockwell Collins", "Rodman Group", "Rohde & Schwarz", "Roke", "Roketsan", "Rolls-Royce plc",
    "Roman Brasov", "ROMARM", "Rosomak", "Rostec", "Rothe Development", "Rovsing A/S", "RQ Construction", "RS-UAS", "RTX", "RUAG", "RV Connex", "RWG Repair & Overhauls USA", "Saab", "Sabiex Group",
    "Sabre Systems", "Sabreliner Corporation", "SAFAT", "Safe Boats Intl", "Safran", "Sagemcom", "SAIC", "Sako", "SAL", "Salient CRGT Inc", "Sallyport Global Holdings", "SAN", "San Yang", "Sandia Nat Labs",
    "Sanmina-SCI", "Sanska", "Santana Motors", "Santierul Naval", "SANUKI Shipbldg", "SAPURA", "Sapura Thales", "Sarco Defense", "Sasebo Heavy Ind", "Satuma", "Savox Communications", "SBIC",
    "Scandinavian Avionics", "Scania", "Scheepvaart KB", "Schiebel", "Schweizer", "Schutt Industries", "Science and Engineering Services", "Science Applications International", "Scientia Global",
    "Scientific Research Corp.", "Scorpene JV", "SCOTTY Group", "SCR", "SEA", "Seabird Aviation", "Sealift Inc", "Seaspan Marine", "Seaward Marine Services", "Second-Hand", "Sectra Comm Sys", "SecuriGence",
    "Sedef Shipbuilding", "Seed Innovations", "Seemann Composites LLC",
    "Sefine Shipyard", "Segue Technologies", "Selah Shipbuilding", "SELEX Elsag", "Selex ES", "SEMAN Peru", "SEPECAT", "SEPI", "Sepura", "Serbian State", "Serco Group plc", "SES", "SETEL/REMSCO", "SGJV",
    "Shaanxi", "Shaanxi Auto Grp", "Shenyang", "Shijiazhuang", "Shin Maywa Industries", "Shin Yang", "Shoft Shipyard", "Short Brothers", "SI Systems Technologies", "SICC", "Sielman S.A.", "Siemens",
    "Sierra Nevada", "SIG Sauer", "Sigen Consortium", "Sikorsky", "Silent Sentinel", "Silver Ships Inc.", "SIMA Peru", "Singapore Tech.", "SingTel Group", "SISDEF", "Sistemprom", "Sisu Auto",
    "SITAB Consortium", "SK Holdings", "Skanska", "SkyAlyne", "Skydio", "Slingsby", "SmartShooter", "Smartronix Inc", "SMS Data Products", "SNC-Lavalin", "SNVI", "Sobeca", "Soby Vaerft", "Socarenam",
    "Socata", "Sodexo Management Inc.", "SOFIS-TRG", "SOFRAME", "Sojitz Corporation", "Soko", "Solar Industries", "Solers", "Solstad Offshore", "SONAK", "Sonalysts Inc.", "Songthu Corporation",
    "Southern African Ship", "Southern Maryland Electric Cooperative", "Southern Resc'h", "Southwest Resc'h", "Soviet Tank Plant", "Sozvezdie JSC", "SPA", "Spaceflux", "SpaceX", "Spanish Missile Systems",
    "Sparton De Leon Springs LLC", "Special Technology Ctr", "Spectra", "Spectrum Comm", "SpearUAV", "SpeedCast", "Sprint", "SR Telecom", "SRC", "SRCTec", "ST Aerospace", "StandardAero Inc.",
    "Stark Aerospace", "Stauder Technologies", "Sterling Computers", "Steyr", "STG", "Stinger ProjectGP", "STM Group", "Streit Group", "STS International", "STX Corporation", "Subaru",
    "Submarine Manufacturing and Products", "Sukhoi", "Sumaria Systems", "Sumidagawa Ship", "Sumitomo", "Summit Aviation", "Sunair", "Supacat", "Superior Govt Sol", "Superior Marine Ways",
    "Surrey Satellite Technology", "Survey Copter", "Suzuki Motor Corp", "SVI Engineering", "Swan Hunter", "Swecon", "Swede Ship", "SwedishSpace Cp", "Swiftships SB LLC", "Symetrics", "Synectic Group",
    "Sypaq Systems", "Sypris Solutions", "Sys for Def/GVS",
    "System Studies & Simulation", "Systematic", "Systems Planning and Analysis", "T. Mariotti", "Tactical Air Support Inc.", "Tactical Engineer", "TADANO", "TAE Aerospace", "TAI",
    "Talbert Manufacturing Inc.", "Target Technologia", "Taskizak Shipyard", "TAT Technologies", "Tata Advanced Systems", "Tata Group", "TATRA", "Taurus Systems", "Taylor Defense Products", "TCG", "TCIL",
    "TDW GmbH", "TDX International", "Technica Corp", "Technical Comms", "Technology Unlim", "Tecnam", "TECNOBIT", "Tekever", "Telecomm Systems", "Teledyne", "Teledyne FLIR", "Telephonics Corp.",
    "Telespazio", "Teletronics Technology", "Telia Finland", "Tellumat", "Telos Corp", "Telstra", "Terberg Group", "Terma A/S", "Tesat Spacecom", "TESCO INDOMARITIM", "TESLA", "TESS Defence",
    "Tesseract Ventures", "TETRAEDR", "Texas A&M", "Textron", "Thales", "Thales Alenia Sp", "ThalesRaytheon", "The MIL Corp.", "The Whiting-Turner Contracting Co.", "TWPG", "THEON International", "ThirdEye",
    "Thoma-Sea Ship", "Thrane & Thrane", "Threod Systems", "Thuraya", "ThyssenKrupp AG", "Timken Gears & Services", "Titan Aircraft", "TKC Global Solutions", "TNO", "Tobyhanna Army", "Tomahawk Robotics",
    "Top Aces", "Toshiba", "Toyota Motor Corp", "Trans-Ce Cargo SA", "Transall", "Transas Group", "Transbit", "Transfield Services", "TRAX International", "TrellisWare Tech", "Trideum Corp",
    "Triman Industries", "Triton Group Hold", "TRU Simulation Plus Training", "TRX System", "TSS Solutions", "TTC TELEKOM", "TUBITAK", "Tupolev", "Turkish AFF", "Turner Construction",
    "Twin Commander Aircraft", "TYBRIN", "Tyco Intl", "Tyovene", "Tyto Athene", "Tyvak International", "UAV Communications", "UAV Solutions", "Uavision Aeronautics", "UCOCAR", "Uconsystem",
    "UK Docks Marine Services", "Ukraine Weapons", "UkraineTank Plant", "Ukroboronprom", "Ukrspecsystems", "Ulijanovsk", "UltiSat", "Ultra Dimensions Pvt. Ltd.",
    "Ultra Electronics", "Ultra I&C", "Ultra Maritime", "UMM", "Umoe Group", "Unicom", "Unicom Government", "Unknown", "Unified Industries", "UNIMO Technology", "Unimor Radiocom", "Unisys", "Unit Co.",
    "United Crane and Excavation Inc.", "United Electronics", "United Launch Alliance", "Univ of Texas", "Univ of Toronto", "Universal Shipbldg", "Unman'dDynamics", "Ural Works Civil Aviation",
    "Uralvagonzavod", "URC Systems", "UROVESA", "US Marine Inc", "US Ordnance", "USCG YARD", "UTVA", "UVision Global Aero", "Valero Marketing and Supply", "Valiant Global Defense", "Valkyrie Aero",
    "Van's Aircraft", "Multiple", "Vector Scientific", "Vector Solutions", "Vectrus Systems Corp.", "Vega Company", "Vencore", "Veritas Capital", "Verizon", "Vertex Aerospace", "Vertex Standard", "Vestel",
    "ViaSat Inc", "Victory Solutions Inc.", "VideoRay LLC", "Viettel Group", "Vigor Industrial", "Viking Air", "Viking Arms", "Vimpel", "Vladimir Radio", "Volkswagen Group", "Volvo Group", "Von Wolf",
    "VOP 025", "VOP 026 Sternberk", "VPK", "VSE Corp.", "Vulcanair", "V2X", "Walsh Federal LLC", "Wartsila", "Watterson Construction Co.", "WB Electronics", "WBA Blindajes Alemanes", "West Sea Shipyard",
    "Weststar Group", "WG Yates and Sons", "Wildflower Intl", "Windmill Intl", "World Wide Tech", "WULCO Inc.", "WZE", "WZM", "X-Bow", "Xian", "Xian ASN Technical Group", "XTAR", "Yakovlev", "Yamaha",
    "Yaroslavl Radio", "Yeonhab Precision", "Yokohama Yacht", "Yoland Corp.", "Yonca-Onuk", "Yugoimport-SDPR", "Zala", "Zamil Offshore", "Zen Technologies", "Zenair LTD", "Zenit Shipyard", "Zenith", "Zlin",
    "Zwijnenburg", "ZyXEL Comm", "Hydroid Inc", "West Coast JV,", "University of Dayton Research Institute", "Saguaro Business Solutions LLC", "Learjet", "General Dynamics Electric Boat",
    "Ball Aerospace & Technologies", "TCOM", "Raytheon Missiles and Defense", "Lockheed Martin Missile and Fire Control", "EFW", "Amherst Systems", "Lockheed Martin Sippican", "Hamilton Sundstrand",
    "Northrop Grumman Aerospace",
    "R.A. Burch Construction", "Lockheed Martin – Rotary and Mission Systems", "Trace Systems", "Northrop Grumman Space Systems Sector", "L-3 Communications Integrated Systems", "Flint Electric Membership",
    "Gray Analytics", "Lockheed Martin Aeronautics", "Lockheed Martin Space", "LTM Inc", "Alberici-Mortenson", "Atlantic Signal", "Haight Bey & Associates", "Container Research Corp",
    "Essex Electro Engineers", "TechFlow Mission Support", "Chugach Range and Facilities Services", "Raytheon Space and Airborne Systems", "Innovative Scientific Solutions", "Delavan", "Covalus",
    "Chromalloy Component Services", "Armorworks Enterprises", "Metro Machine", "Alloy Surfaces", "Valley Tech Systems", "Keysight Technologies", "Azure Summit Technology", "Isometrics", "Stratascorp",
    "Synergy Electric Company", "Custom Manufacturing & Engineering", "East West Industries", "MPR Associates", "ARCTOS Technology Solutions", "Enlighten IT Consulting", "Barrett Firearms",
    "Ametek Programmable Power", "Applied Physical Sciences", "SupplyCore", "Federal Resources", "General Atomics", "Penguin Computing", "Mancon", "Integrated Marine Services", "Compass Systems",
    "DRS Sustainment Systems", "IronMountain Solutions", "Ball Aerospace & Technologies", "Yulista Services", "SyQwest", "Advanced Technology Systems", "Cleveland Construction", "Canadian Commercial Corp",
    "Systima Technologies", "Ocean Ships", "Metro Machine Corp", "ImSAR LLC", "Systems Application & Technologies", "Twin Disc", "Konecranes Nuclear Equipment and Services", "Progeny Systems", "WEBCO",
    "REEL COH", "Waterman Transport", "Western Metal Supply", "Security Signals", "Wolverine Tube", "BC Customs LLC", "TLD America", "Crane Technologies Group", "IDSC Holdings", "AAR Manufacturing",
    "B & D Electric", "Vector CSP LLC", "Accurate Machine & Tool Corp", "Mississippi State University", "Stephenson Stellar Corp", "Earthly Dynamics", "Woolpert Inc", "Halter Marine", "Marion Manufacturing",
    "FN America LLC", "CDM Constructors", "Florida State University - Center for Advanced Power Systems",
    "International Marine & Industrial Applicators", "Zodiac-Poettker HBZ JV II LLC", "DigiFlight", "Globe Composite Solutions", "Meggitt Polymers and Composites", "Martin-Baker Aircraft",
    "United Kingdom Ministry of Defence", "Ultimate Training Munitions", "PAS Technologies", "DCM Clean Air Products", "Management Services", "Technology Service Corp", "General Electric Aviation",
    "ACME/RHB", "Howell Industries", "Airdyne Aerospace", "Dominion Energy", "Bionetics", "Choctaw Defense Manufacturing", "Centauri", "DRS Naval Power Systems", "Sentry View Systems", "ERAPSCO",
    "AAR Government Services", "Management Services", "L3 Doss Aviation", "AgustaWestland Philadelphia", "Marvin Engineering", "Collins Elbit Vision Systems", "FGS", "Navistar Defense LLC", "Voith Hydro",
    "Delfasco",
]

# --- REGION MAPPING (Simplified for Demo) ---
# Shape: {region label: [country name, ...]}.
# Keys are the region labels; values are the lists of country names that
# belong to that region. To find a country's region, search the value lists
# and report the matching key.
# The "Unknown" key is a catch-all bucket: it lists countries/territories that
# are not assigned to any named region, plus the literal placeholder "Unknown".
REGION_MAP = {
    "Sub-Saharan Africa": [
        "Angola", "Benin", "Botswana", "Burkina Faso", "Burundi", "Cameroon", "Cape Verde",
        "Central African Republic", "Chad", "Congo, Democratic Republic of", "Congo, Republic of",
        "Djibouti", "Equatorial Guinea", "Eritrea", "Eswatini", "Ethiopia", "Gabon", "Gambia",
        "Ghana", "Guinea", "Guinea-Bissau", "Ivory Coast", "Kenya", "Lesotho", "Liberia",
        "Madagascar", "Malawi", "Mali", "Mauritius", "Mozambique", "Namibia", "Niger",
        "Nigeria", "Rwanda", "Senegal", "Seychelles", "Sierra Leone", "Somalia", "South Africa",
        "South Sudan", "Sudan", "Tanzania", "Togo", "Uganda", "Zambia", "Zimbabwe"
    ],
    "Asia-Pacific": [
        "Australia", "Brunei", "Cambodia", "China", "Hong Kong", "Indonesia", "Japan", "Laos",
        "Malaysia", "Mongolia", "Myanmar", "New Zealand", "North Korea", "Papua New Guinea",
        "Philippines", "Singapore", "South Korea", "Taiwan", "Thailand", "Vietnam"
    ],
    "Europe": [
        "Albania", "Austria", "Belgium", "Bosnia and Herzegovina", "Bulgaria", "Croatia", "Cyprus",
        "Czech Republic", "Denmark", "Estonia", "Finland", "France", "Georgia", "Germany", "Greece",
        "Hungary", "Iceland", "Ireland", "Italy", "Kosovo", "Latvia", "Lithuania", "Luxembourg",
        "Malta", "Montenegro", "Netherlands", "North Macedonia", "Norway", "Poland", "Portugal",
        "Romania", "Serbia", "Slovakia", "Slovenia", "Spain", "Sweden", "Switzerland", "Turkey",
        "Ukraine", "United Kingdom"
    ],
    "Latin America": [
        "Argentina", "Bahamas", "Barbados", "Belize", "Bolivia", "Brazil", "Chile", "Colombia",
        "Costa Rica", "Cuba", "Curacao", "Dominican Republic", "Ecuador", "El Salvador", "Guatemala",
        "Guyana", "Haiti", "Honduras", "Jamaica", "Mexico", "Nicaragua", "Panama", "Paraguay",
        "Peru", "Suriname", "Trinidad and Tobago", "Uruguay", "Venezuela"
    ],
    "Middle East and North Africa": [
        "Algeria", "Bahrain", "Egypt", "Iran", "Iraq", "Israel", "Jordan", "Kuwait", "Lebanon",
        "Libya", "Mauritania", "Morocco", "Oman", "Qatar", "Saudi Arabia", "Syria", "Tunisia",
        "United Arab Emirates", "Yemen"
    ],
    "North America": ["Canada", "USA"],
    "Russia & CIS": [
        "Armenia", "Azerbaijan", "Belarus", "Kazakhstan", "Kyrgyzstan", "Moldova", "Russia",
        "Tajikistan", "Turkmenistan", "Uzbekistan"
    ],
    "South Asia": [
        "Afghanistan", "Bangladesh", "India", "Maldives", "Nepal", "Pakistan", "Sri Lanka"
    ],
    "Unknown": [
        "Andorra", "Antigua and Barbuda", "Bhutan", "Comoros", "Dominica", "Federated States of Micronesia",
        "Fiji", "Grenada", "Kiribati", "Liechtenstein", "Marshall Islands", "Monaco", "Nauru", "Palau",
        "Palestine", "Puerto Rico", "Saint Kitts and Nevis", "Saint Lucia", "Saint Vincent and the Grenadines",
        "Samoa", "San Marino", "Sao Tom and Principe", "Solomon Islands", "Timor-Leste", "Tonga", "Tuvalu",
        "Unknown", "Vanuatu", "Vatican City", "Western Sahara"
    ]
}

# --- MARKET TAXONOMY ---
# Three-level classification tree, stored as a list of dicts:
#   market_segment (str)
#     -> system_types_general: list of {"name": <general type>,
#                                        "system_types_specific": [<specific type>, ...]}
# Specific-type labels are only unique within their parent; e.g. "Construction"
# and "Maintenance/Upgrade" repeat under every Infrastructure general type.
TAXONOMY_DATA = [
  {"market_segment": "Air Platforms", "system_types_general": [
      {"name": "Fixed Wing", "system_types_specific": ["Fighter", "Transport Aircraft", "Trainers", "Tanker", "Maritime Aircraft", "C4ISR", "Bomber", "Light Combat Aircraft", "Gunship", "ISR-Strike", "AEW&C", "Target Drone"]},
      {"name": "Rotary Wing", "system_types_specific": ["Attack Helicopter", "Transport Helicopter", "Utility Helicopter", "Maritime Helicopter", "Scout Helicopter", "Rotary Wing Attack", "Rotary Wing Maritime", "Rotary Wing Scout", "Rotary Wing Transport"]},
      {"name": "UAV", "system_types_specific": ["MALE UAV", "HALE UAV", "Tactical UAV", "Mini/Micro UAV", "UCAV", "Loitering Munition"]},
      {"name": "Other Lift Types", "system_types_specific": ["Lighter-than-Air", "Spaceplane", "Hybrid", "Airship", "Parafoil"]}
  ]},
  {"market_segment": "Land Platforms", "system_types_general": [
      {"name": "Armoured Fighting Vehicles", "system_types_specific": ["Main Battle Tank", "Infantry Fighting Vehicle", "Armoured Personnel Carrier", "Armoured Reconnaissance Vehicle", "Mine Protected Vehicle", "Light Tank", "Assault Vehicle"]},
      {"name": "Artillery", "system_types_specific": ["Self-Propelled Artillery", "Towed Artillery", "Multiple Rocket Launcher", "Mortar"]},
      {"name": "Tactical Vehicles", "system_types_specific": ["Light Utility Vehicle", "Truck", "All-Terrain Vehicles"]},
      {"name": "Logistics & Support", "system_types_specific": ["Engineering Vehicle", "Recovery Vehicle", "Vehicle Launch Bridge", "Mine Warfare Vehicles", "NBC Vehicles", "Amphibious Assault Vehicle"]},
      {"name": "Unmanned Ground Vehicles", "system_types_specific": ["Combat UGV", "ISR UGV", "Logistics UGV", "EOD UGV"]}
  ]},
  {"market_segment": "Naval Platforms", "system_types_general": [
      {"name": "Surface Combatants", "system_types_specific": ["Aircraft Carrier", "Destroyer", "Frigate", "Corvette", "Cruiser"]},
      {"name": "Sub-Surface", "system_types_specific": ["Ballistic Missile Submarine", "Attack Submarine", "Cruise Missile Submarine", "Midget Submarine", "Diesel-Powered Submarine", "Nuclear-Powered Submarine"]},
      {"name": "Amphibious", "system_types_specific": ["Amphibious Assault Ship", "Landing Platform Dock", "Landing Ship", "Landing Craft"]},
      {"name": "Patrol and Costal Combatants", "system_types_specific": ["Patrol Vessel", "Fast Attack Craft", "Patrol Boat/Craft - Coastal", "Patrol Boat/Craft - Ocean", "Patrol Boat/Craft - Riverine"]},
      {"name": "Auxiliaries", "system_types_specific": ["Replenishment Ship", "Transport Ship", "Hospital Ship", "Icebreakers", "Research/Survey Vessels"]},
      {"name": "Mine Warfare", "system_types_specific": ["Mine Sweeper", "Mine Hunter", "Mine Counter-Measures"]},
      {"name": "Unmanned Maritime Systems", "system_types_specific": ["USV", "UUV"]}
  ]},
  {"market_segment": "Space Systems", "system_types_general": [
      {"name": "Satellite", "system_types_specific": ["Communication Satellite", "ISR Satellite", "Navigation Satellite"]},
      {"name": "Launch Vehicle", "system_types_specific": ["Heavy Lift Launch Vehicle", "Medium Lift Launch Vehicle", "Small Lift Launch Vehicle"]},
      {"name": "Space Ground Segment", "system_types_specific": ["Ground Station"]}
  ]},
  {"market_segment": "C4ISR Systems", "system_types_general": [
      {"name": "Radar", "system_types_specific": ["Air Defense Radar", "Fire Control Radar", "Surveillance Radar", "Air Search Radar", "Surface Surveillance Radar", "Navigation Radar", "Weather Radar"]},
      {"name": "Communications", "system_types_specific": ["Tactical Radio", "Satcom", "Network Equipment", "Data links", "Satellite Communications"]},
      {"name": "Command and Control", "system_types_specific": ["C2 System", "Battle Management System"]},
      {"name": "Electronic Warfare", "system_types_specific": ["Electronic Attack", "Electronic Support", "Electronic Protection"]},
      {"name": "Electro-optic Sensor", "system_types_specific": ["Imaging EO/IR", "Targeting EO/IR"]},
      {"name": "Sonar", "system_types_specific": ["Airborne", "Naval"]},
      {"name": "Cyber", "system_types_specific": ["Cyber Defense/Offense"]}
  ]},
  {"market_segment": "Weapon Systems", "system_types_general": [
      {"name": "Missile", "system_types_specific": ["Air-to-Air Missile", "Air-to-Surface Missile", "Surface-to-Air Missile", "Surface-to-Surface Missile", "Anti-Tank Guided Missile", "Ballistic Missile", "Cruise Missile", "Anti-Ship", "Anti-Submarine"]},
      {"name": "Munition", "system_types_specific": ["Small Arms Ammunition", "Medium Caliber Ammunition", "Large Caliber Ammunition", "Bomb", "Rocket", "Guided Bomb", "Guided Rocket"]},
      {"name": "Weapon", "system_types_specific": ["Small Arm", "Light Weapon", "Cannon"]},
      {"name": "Torpedo", "system_types_specific": ["Lightweight", "Heavyweight"]},
      {"name": "Directed Energy Weapon", "system_types_specific": ["Laser", "Microwave", "Sonic"]}
  ]},
  {"market_segment": "Training & Simulation", "system_types_general": [
      {"name": "Simulators", "system_types_specific": ["Flight Simulator", "Vehicle Simulator", "Maritime Simulator", "Weapon Simulator"]},
      {"name": "Training Aids", "system_types_specific": ["Training Other"]}
  ]},
  {"market_segment": "Infrastructure", "system_types_general": [
      {"name": "Shipyards/Ports/Harbours", "system_types_specific": ["Construction", "Maintenance/Upgrade"]},
      {"name": "Aircraft Basing", "system_types_specific": ["Construction", "Maintenance/Upgrade"]},
      {"name": "Training Facilities", "system_types_specific": ["Construction", "Maintenance/Upgrade"]}
  ]}
]
# Pretty-printed JSON form of the taxonomy; substituted into
# SYSTEM_CLASSIFIER_PROMPT_TEXT's {taxonomy_reference} placeholder at call time.
TAXONOMY_STR = json.dumps(TAXONOMY_DATA, indent=2)

# ==============================================================================
# 2. PROMPTS
# ==============================================================================

# System prompt for the splitter step: asks the model whether one contract
# announcement must become several output rows (one per customer country /
# operator). Sent together with the SplitterResult response model, so the
# field names mentioned here (`requires_split`, `specific_value_millions`)
# must stay in sync with that model and with SplitItem.
SPLITTER_PROMPT_TEXT = """
You are a Defense Contract Analyzer. Your GOAL is to identify if the contract requires MULTIPLE database rows.

RULES FOR SPLITTING:
1. **Customer Country Split**: If "Foreign Military Sales (FMS) to Japan, Korea, and Australia", you MUST split into 3 items.
2. **Customer Operator Split**: If "10 for the Navy and 5 for the Air Force", you MUST split into 2 items.
3. **Ukraine Assistance**: If equipment is purchased FOR Ukraine by another country (e.g. USA buys ammo for Ukraine), this is a single row: Customer Country = USA, Operator = "Ukraine (Assistance)". Do NOT split unless multiple donor countries are listed.
4. **NO Split**: If it's a single customer/operator (e.g. "US Navy"), return "requires_split": False.

**VALUE DISTRIBUTION**:
- If specific amounts are mentioned for each split (e.g. "$20M for Navy, $10M for Army"), EXTRACT those values into the `specific_value_millions` field.
- If no breakdown is given, leave `specific_value_millions` as null.

Return the list of split items with specific Country and Operator for each.
"""

# System prompt for the contract-extraction step (supplier, program type,
# value, quantity, currency, G2G/B2G). Sent together with the ContractResult
# response model; the program-type and certainty labels listed here correspond
# to the Literal choices declared on that model.
CONTRACT_EXTRACTOR_PROMPT_TEXT = """
You are a Defense Contract Financial Analyst.

TASK: Extract supplier, program type, value, and funding details.

**PROGRAM TYPE DEFINITIONS** (STRICT):
- **Procurement**: Buying NEW hardware, systems, or production units. (Keywords: production, procurement, delivery, manufacture).
- **RDT&E**: Design, testing, prototyping, BEFORE production. (Keywords: development, prototype, research, design).
- **MRO/Support**: Fixing/Sustaining EXISTING systems. (Keywords: maintenance, repair, overhaul, sustainment, logistics support, depot, modernization).
- **Training**: Purchasing training SERVICES (instruction/coaching). NOTE: Buying simulators is "Procurement".
- **Upgrade**: Adding NEW capabilities to existing platforms.
- **Other Service**: Services not covered above.

**SUPPLIER NAME**:
- Extract the company name exactly as written in the awardee section.

**VALUE**:
- Extract the total ceiling or face value in Millions. 
- Value Certainty: "Confirmed" if definite, "Estimated" for IDIQ/Ceilings.

**QUANTITY**:
- Extract the specific number of units if mentioned (e.g. "50 missiles"). If not quantifiable or services, use "Not Applicable".

**CURRENCY**:
- Default to "USD" if U.S. Dollar.

**G2G/B2G**:
- "G2G" ONLY if "Foreign Military Sales" (FMS) is mentioned. Otherwise "B2G".

Return valid JSON.
"""

# System prompt for the geography step: customer country, customer operator
# and supplier country. Sent together with the GeographyResult response model.
GEOGRAPHY_PROMPT_TEXT = """
You are a Defense Geography Analyst. 
Extract the Customer Country, Customer Operator, and Supplier Country.

**RULES**:
1. **Customer Country**: 
   - The nation PAYING/RECEIVING. 
   - For FMS, it is the foreign nation (e.g. "FMS to Japan" -> Customer: Japan).
   - For Ukraine Assistance (US buys for Ukraine) -> Customer: USA.
2. **Customer Operator**:
   - Select from: Army, Navy, Air Force (includes Space Force), Defence Wide, Ukraine (Assistance), Foreign Assistance, Other.
   - If US buys for Ukraine -> Operator: "Ukraine (Assistance)".
3. **Supplier Country**:
   - The country where the Supplier Company is based. (e.g. BAE Systems Inc (USA) vs BAE Systems PLC (UK)).

Return JSON.
"""

# System prompt template for the system-classification step. It is rendered
# with str.format(taxonomy_reference=TAXONOMY_STR) before being sent together
# with the SystemResult response model, so any literal "{" or "}" added to
# this text would have to be doubled.
SYSTEM_CLASSIFIER_PROMPT_TEXT = """
You are a Senior Defense System Classification Analyst.
1. **REFERENCE TAXONOMY**: {taxonomy_reference}
2. **TASK**:
   - Classify the system described into **Market Segment**, **System Type (General)**, and **System Name**.
   - **System Name (General)**: The Host Platform or Class (e.g., "F-35 Lightning II").
   - **System Name (Specific)**: The Specific Subject (e.g., "F-35A" or "Logistics Services for F-35").
   - **System Piloting**: Crewed, Uncrewed, Optional, Not Applicable.
"""

# ==============================================================================
# 3. PYDANTIC MODELS
# ==============================================================================

# One proposed output row from the splitter step: a single customer
# country/operator pair plus, optionally, the dollar amount the text
# explicitly assigns to that portion.
class SplitItem(BaseModel):
    # Customer country and operator for this row (required).
    customer_country: str = Field(..., description="The country for this split row (e.g. 'Japan').")
    customer_operator: str = Field(..., description="The operator (e.g. 'Navy', 'Air Force', 'Ukraine (Assistance)').")
    # Free-text quantity/detail; copied into the row's value note downstream.
    quantity_or_note: str = Field(..., description="Specific quantity or details for this split.")
    # None means "no explicit breakdown given" and triggers an equal division
    # of the contract total across all split items.
    specific_value_millions: Optional[float] = Field(None, description="Only if the text explicitly assigns a dollar value to this portion.")

# Structured answer of the splitter step.
class SplitterResult(BaseModel):
    # Model's free-text justification; not used when building rows.
    reasoning: str = Field(..., description="Why a split is or is not needed.")
    # Rows are only split when this is True AND split_items is non-empty.
    requires_split: bool = Field(..., description="True if multiple rows needed.")
    # Defaults to an empty list when the model returns no items.
    split_items: List[SplitItem] = Field(default_factory=list)

# Structured answer of the geography step. All three fields are required
# free-form strings; the allowed operator labels are only described in
# GEOGRAPHY_PROMPT_TEXT and are not enforced by this model.
class GeographyResult(BaseModel):
    customer_country: str
    customer_operator: str
    supplier_country: str

# Structured answer of the contract-extraction step (commercial terms).
class ContractResult(BaseModel):
    # Supplier name as extracted; normalised later by clean_supplier_name().
    supplier_name_raw: str = Field(..., description="Raw supplier name.")
    # Closed set of program types; "Unknown" is accepted in addition to the
    # categories defined in CONTRACT_EXTRACTOR_PROMPT_TEXT.
    program_type: Literal["Procurement", "RDT&E", "MRO/Support", "Training", "Upgrade", "Other Service", "Unknown"]
    # Contract value expressed in millions of `currency`.
    value_millions: float
    value_certainty: Literal["Confirmed", "Estimated"]
    # Kept as a string because the value may be a count or "Not Applicable".
    quantity: str = Field(default="Not Applicable", description="Number of units or 'Not Applicable'.")
    # Raw currency label; normalised later by format_currency().
    currency: str
    g2g_b2g: Literal["G2G", "B2G"]
    # Only written to the output when program_type is "MRO/Support".
    mro_duration_months: Optional[int] = Field(None, description="Only for MRO contracts.")

# Structured answer of the system-classification step. The segment/type
# fields are plain strings, so they are not validated against TAXONOMY_DATA
# by this model.
class SystemResult(BaseModel):
    market_segment: str
    system_type_general: str
    system_type_specific: str
    # Host platform/class vs. the specific subject of the contract
    # (see SYSTEM_CLASSIFIER_PROMPT_TEXT for the examples given to the model).
    system_name_general: str
    system_name_specific: str
    piloting: Literal["Crewed", "Uncrewed", "Optional", "Not Applicable"]
    # Required in the response, but not read by process_single_contract.
    confidence: float

# ==============================================================================
# 4. HELPER FUNCTIONS
# ==============================================================================

def get_region(country: str, region_map: Optional[dict] = None) -> str:
    """Return the region label that ``country`` belongs to.

    The mapping is keyed by region label and each value is the list of country
    names in that region, so the lookup searches the value lists and returns
    the matching key. Matching ignores case and surrounding whitespace.

    Args:
        country: Country name to look up. Empty or ``None`` yields "Unknown".
        region_map: Optional ``{region: [country, ...]}`` mapping to search.
            Defaults to the module-level ``REGION_MAP``.

    Returns:
        The region label, or "Unknown" when the country is missing or is not
        listed under any region.
    """
    if not country:
        return "Unknown"

    mapping = REGION_MAP if region_map is None else region_map
    wanted = country.strip().casefold()

    for region, countries in mapping.items():
        # Exact (case-insensitive) name match; substring matching would
        # confuse e.g. "Guinea" with "Guinea-Bissau" or "Niger" with "Nigeria".
        if any(wanted == name.casefold() for name in countries):
            return region
    return "Unknown"

def clean_supplier_name(raw_name: str) -> str:
    """Normalise an extracted supplier name against ``VERIFIED_SUPPLIERS``.

    Returns "Unknown" for empty input or the placeholders "unknown"/"n/a"
    (any letter case). A name already on the verified list is returned as is;
    otherwise the single closest verified name (difflib ratio >= 0.6) is
    returned, and the raw name is kept when nothing is close enough.
    """
    if not raw_name:
        return "Unknown"
    if raw_name.lower() in ("unknown", "n/a"):
        return "Unknown"

    # Exact hit: no fuzzy matching needed.
    if raw_name in VERIFIED_SUPPLIERS:
        return raw_name

    closest = difflib.get_close_matches(raw_name, VERIFIED_SUPPLIERS, n=1, cutoff=0.6)
    return closest[0] if closest else raw_name

def get_domestic_content(cust_country, supp_country):
    """Classify a purchase as domestic or imported.

    Returns "Unknown" when either country is empty/None, "Indigenous" when
    both names are equal ignoring letter case, and "Imported" otherwise.
    """
    if not (cust_country and supp_country):
        return "Unknown"
    same_country = cust_country.lower() == supp_country.lower()
    return "Indigenous" if same_country else "Imported"

def format_currency(curr: str) -> str:
    """Normalise a currency label for the output sheet.

    Empty/None input and the labels "USD" / "US DOLLAR" (after trimming and
    upper-casing) all become "USD$"; any other label is returned trimmed and
    upper-cased.
    """
    if not curr:
        return "USD$"
    label = curr.strip().upper()
    return "USD$" if label in ("USD", "US DOLLAR") else label

def parse_contract_date(date_str):
    """Split a contract date into (month name, year).

    Parsing is delegated to ``pandas.to_datetime`` so that ISO dates,
    timestamp strings and other common date spellings are accepted.

    Args:
        date_str: Date value of any type; it is converted with ``str()``
            before parsing.

    Returns:
        ``(full month name, year as int)``, e.g. ``("June", 2021)``, or
        ``("Unknown", "Unknown")`` when the value cannot be parsed as a date.
    """
    try:
        # errors="coerce" turns unparseable text into NaT instead of raising.
        parsed = pd.to_datetime(str(date_str), errors="coerce")
    except (ValueError, TypeError, OverflowError):
        return "Unknown", "Unknown"

    if pd.isna(parsed):
        return "Unknown", "Unknown"
    return parsed.strftime("%B"), parsed.year

def get_instructor_client():
    """Build the structured-output LLM client used by the pipeline.

    Reads the LLMFOUNDRY_TOKEN environment variable, creates an OpenAI client
    pointed at the LLM Foundry endpoint, and wraps it with instructor in JSON
    mode.
    """
    token = os.environ.get("LLMFOUNDRY_TOKEN")
    openai_client = OpenAI(
        base_url="https://llmfoundry.straive.com/openai/v1/",
        # The API key is the token followed by the project suffix.
        api_key=f"{token}:my-test-project",
    )
    return instructor.from_openai(openai_client, mode=instructor.Mode.JSON)

# ==============================================================================
# 5. CORE PIPELINE LOGIC
# ==============================================================================

def _run_structured_prompt(client, response_model, system_prompt: str, text: str):
    """Send one system+user prompt pair and return the parsed ``response_model``."""
    return client.chat.completions.create(
        model="gpt-4o-mini",
        response_model=response_model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
    )


def process_single_contract(client, text: str, date_val: str, url_val: str):
    """Convert one contract announcement into one or more output rows.

    Four structured LLM calls are made per contract, each exactly once:
    splitter, contract extraction, system classification and geography.
    When the splitter asks for a split, one row is produced per split item;
    otherwise a single row is produced from the geography result.

    Args:
        client: Structured-output client exposing
            ``chat.completions.create(model=..., response_model=..., messages=...)``.
        text: Contract description; sent as the user message of every call and
            copied into the "Description of Contract" column.
        date_val: Contract date as text; used for the signing month/year and
            copied into the "Contract Date" column.
        url_val: Source link copied into the "Source Link(s)" column.

    Returns:
        A list of dicts, one per output row, keyed by the final column names.

    Raises:
        Whatever the client raises for a failed call or invalid response;
        nothing is caught here.
    """
    # --- 1. Splitter ---
    split_res = _run_structured_prompt(client, SplitterResult, SPLITTER_PROMPT_TEXT, text)

    # --- 2. Contract Extraction ---
    contract_res = _run_structured_prompt(
        client, ContractResult, CONTRACT_EXTRACTOR_PROMPT_TEXT, text
    )
    final_supplier = clean_supplier_name(contract_res.supplier_name_raw)
    final_currency = format_currency(contract_res.currency)

    # --- 3. System Classification ---
    system_res = _run_structured_prompt(
        client,
        SystemResult,
        SYSTEM_CLASSIFIER_PROMPT_TEXT.format(taxonomy_reference=TAXONOMY_STR),
        text,
    )

    # --- 4. Geography ---
    # Asked once per contract. The supplier country does not depend on how the
    # customers are split, so every row shares this single answer instead of
    # issuing one identical request per split row.
    geo_res = _run_structured_prompt(client, GeographyResult, GEOGRAPHY_PROMPT_TEXT, text)
    s_country = geo_res.supplier_country

    # --- 5. Logic & Row Assembly ---
    split_items = split_res.split_items if split_res.requires_split else []

    if split_items:
        # Use the per-item values only when EVERY item carries one. Compare
        # against None so that an explicit 0.0 still counts as a stated value.
        has_specific_values = all(
            item.specific_value_millions is not None for item in split_items
        )
        if not has_specific_values:
            equal_share = round(contract_res.value_millions / len(split_items), 3)

        items_to_process = []
        for item in split_items:
            if has_specific_values:
                value = item.specific_value_millions
                certainty = "Confirmed"
                note = f"Split Item: {item.quantity_or_note}. Exact value."
            else:
                value = equal_share
                certainty = "Estimated"
                note = f"Split Item: {item.quantity_or_note}. Value divided equally."
            items_to_process.append({
                "country": item.customer_country,
                "operator": item.customer_operator,
                "value": value,
                "certainty": certainty,
                "note": note,
                "split_flag": True,
            })
    else:
        # No split: customer details come from the geography step.
        items_to_process = [{
            "country": geo_res.customer_country,
            "operator": geo_res.customer_operator,
            "value": contract_res.value_millions,
            "certainty": contract_res.value_certainty,
            "note": "",
            "split_flag": False,
        }]

    # Values shared by every row of this contract.
    sign_month, sign_year = parse_contract_date(date_val)
    today_str = datetime.date.today().strftime("%Y-%m-%d")
    supplier_region = get_region(s_country)
    if contract_res.program_type == "MRO/Support":
        mro_duration = contract_res.mro_duration_months
    else:
        mro_duration = "Not Applicable"

    results = []
    for item in items_to_process:
        # EXACT COLUMN MAPPING
        results.append({
            "Customer Region": get_region(item["country"]),
            "Customer Country": item["country"],
            "Customer Operator": item["operator"],
            "Supplier Region": supplier_region,
            "Supplier Country": s_country,
            "Domestic Content": get_domestic_content(item["country"], s_country),
            "Market Segment": system_res.market_segment,
            "System Type (General)": system_res.system_type_general,
            "System Type (Specific)": system_res.system_type_specific,
            "System Name (General)": system_res.system_name_general,
            "System Name (Specific)": system_res.system_name_specific,
            "System Piloting": system_res.piloting,
            "Supplier Name": final_supplier,
            "Program Type": contract_res.program_type,
            "Expected MRO Contract Duration (Months)": mro_duration,
            "Quantity": contract_res.quantity,
            "Value Certainty": item["certainty"],
            "Value (Million)": item["value"],
            "Currency": final_currency,
            # Assumption: input is USD, so the value is copied unconverted.
            # If currency conversion is needed, add it here.
            "Value (USD$ Million)": item["value"],
            "Value Note (If Any)": item["note"],
            "G2G/B2G": contract_res.g2g_b2g,
            "Signing Month": sign_month,
            "Signing Year": sign_year,
            "Description of Contract": text,
            "Additional Notes (Internal Only)": "Auto-extracted via Agentic AI",
            "Source Link(s)": url_val,
            "Contract Date": date_val,
            "Reported Date (By SGA)": today_str,
        })

    return results

# ==============================================================================
# 6. EXECUTION
# ==============================================================================

def run_pipeline(input_path, output_path):
    """Process every contract in an Excel workbook and write the rows to CSV.

    Reads ``input_path`` with pandas, runs ``process_single_contract`` on each
    row whose description is at least 10 characters long, and writes the
    collected rows to ``output_path`` in the fixed column order below.
    A failure on one row is printed and the remaining rows are still processed.

    Args:
        input_path: Excel file with "Contract Description", "Contract Date"
            and "Source URL" columns.
        output_path: Destination CSV path.
    """
    print(f"Reading {input_path}...")
    source_df = pd.read_excel(input_path)
    llm_client = get_instructor_client()

    collected_rows = []

    for idx, record in source_df.iterrows():
        print(f"Processing Row {idx+1}/{len(source_df)}...")
        description = str(record.get("Contract Description", ""))
        contract_date = str(record.get("Contract Date", ""))
        source_url = str(record.get("Source URL", ""))

        # Descriptions this short cannot be a real contract text; skip them.
        if len(description) >= 10:
            try:
                collected_rows.extend(
                    process_single_contract(llm_client, description, contract_date, source_url)
                )
            except Exception as err:
                # Best-effort batch run: report the failure and keep going.
                print(f"Error row {idx}: {err}")

    # Enforce Column Order
    ordered_columns = [
        "Customer Region", "Customer Country", "Customer Operator", "Supplier Region", "Supplier Country",
        "Domestic Content", "Market Segment", "System Type (General)", "System Type (Specific)",
        "System Name (General)", "System Name (Specific)", "System Piloting", "Supplier Name",
        "Program Type", "Expected MRO Contract Duration (Months)", "Quantity", "Value Certainty",
        "Value (Million)", "Currency", "Value (USD$ Million)", "Value Note (If Any)",
        "G2G/B2G", "Signing Month", "Signing Year", "Description of Contract",
        "Additional Notes (Internal Only)", "Source Link(s)", "Contract Date", "Reported Date (By SGA)"
    ]

    # Reindexing creates any missing columns and drops any extras.
    output_df = pd.DataFrame(collected_rows).reindex(columns=ordered_columns)
    output_df.to_csv(output_path, index=False)
    print(f"Done! Saved to {output_path}")

if __name__ == "__main__":
    # Create a one-row dummy input workbook if none exists yet, so the
    # pipeline can be exercised end to end.
    if not os.path.exists("sample_data.xlsx"):
        print("Creating dummy file for testing...")

        # The description is assembled from adjacent string literals, which
        # Python joins into one string. Every fragment carries its own
        # trailing space(s), so the joined text keeps the announcement's
        # original spacing (including its double spaces).
        sample_description = (
            "Raytheon Missile and Defense, Tucson, Arizona, is awarded a $328,156,454 fixed-price incentive (firm target) contract. "
            "This contract provides for the production and delivery of Lot 21 as follows: "
            "483 AIM-9X Block II all up round tactical missiles (212 for the Navy, 187 for the Air Force and 84 for Foreign Military Sales (FMS) customers); "
            "82 AIM-9X block II plus all up round missiles (eight for the Navy, eight for the Air Force and 66 for FMS customers); "
            "156 Block II Captive Air Training Missiles (82 for the Air Force and 74 for FMS customers); "
            "eight Block II Special Air Training Missiles (two for the Air Force and six for FMS customers); "
            "198 all up round containers (75 for the Navy, 73 for the Air Force and 50 for FMS customers); "
            "six spare advanced optical target detectors (two for the Air Force and four for FMS customers); "
            "five spare advanced optical target detector containers (two for the Air Force and three for FMS customers); "
            "29 spare Block II guidance units (live battery) (13 for the Navy, four for the Air Force, and 12 for FMS customers); "
            "six spare Block II plus guidance units (live battery) for FMS customers; "
            "41 guidance unit containers for FMS customers; "
            "72 spare Captive Air Training Missile guidance units (inert battery) (22 for the Navy, three for the Air Force, and 47 for FMS customers); "
            "two spare Block I propulsion steering sections for the Air Force; "
            "seven spare Block II propulsion steering sections (two for the Navy, four for the Air Force, and one for FMS customers); "
            "72 spare Block II electronics units (68 for the Navy and four for the Air Force); "
            "two classroom explosive ordnance disposal systems trainers for FMS customers; "
            "one practical explosive ordnance disposal systems trainer for a FMS customer; "
            "11 multi-purpose training missiles for various FMS customers; "
            "135 tail caps (eight for the Navy, 16 for the Air Force and 111 for FMS customers); "
            "35 tail cap containers (two for the Navy, four for the Air Force, and 29 for FMS customers); "
            "one lot of spares assets for the Navy; "
            "one lot of spares assets for the Air Force; "
            "and one lot of spares assets for the governments of Australia, Bahrain, Belgium, Bulgaria, Canada, Denmark, Finland, Indonesia, Israel, Japan, Kuwait, Malaysia, Morocco, the Netherlands, Norway, Oman, Poland, Qatar, Romania, Saudi Arabia, Singapore, Slovakia, Republic of  Korea, Taiwan, Turkey, and the United Arab Emirates.  "
            "Work will be performed in Tucson, Arizona (31%); North Logan, Utah (10%); Keyser, West Virginia (9%); Niles, Illinois (8%); "
            "Vancouver, Washington (5%); Ottawa, Ontario, Canada (5%); Goleta, California (4%); Cheshire, Connecticut (4%); "
            "Heilbronn, Germany (3%); Simsbury, Connecticut (2%); San Jose, California (2%); Valencia, California (2%); "
            "Anaheim, California (2%); Cajon, California (2%); Cincinnati, Ohio (1%); Anniston, Alabama (1%); "
            "San Diego, California (1%); Chatsworth, California (1%); Amesbury, Massachusetts (1%); Claremont, California (1%); "
            "Sumner, Washington (1%); and various locations within the continental U.S. (4%), and is expected to be completed in June 2024.  "
            "Fiscal 2021 weapons procurement (Navy) funds in the amount of $98,204,232; "
            "fiscal 2021 missile procurement (Air Force) funds in the amount of $102,681,830; "
            "fiscal 2021 research, development, test and evaluation (Air Force) funds in the amount of $802,382; "
            "fiscal 2020 missile procurement (Air Force) funds in the amount of $257,638; "
            "fiscal 2020 weapons procurement (Navy) funds in the amount of $108,826; "
            "fiscal 2019 missile procurement (Air Force) in the amount of $295,576; "
            "and FMS funds in the amount of $125,805,970 will be obligated at the time of award, $295,576 of which will expire at the end of the current fiscal year. "
            "This contract was not competitively procured pursuant to Federal Acquisition Regulation 6.302-1. "
            "The Naval Air Systems Command, Patuxent River, Maryland, is the contracting activity (N0001921C0723)."
        )

        pd.DataFrame({
            "Contract Description": [sample_description],
            # ISO 8601 (YYYY-MM-DD). The previous value "2021-30-06" had the
            # day and month swapped (30 can only be the day).
            "Contract Date": ["2021-06-30"],
            "Source URL": ["https://www.defense.gov/News/Contracts/Contract/Article/2678227/"]
        }).to_excel("sample_data.xlsx", index=False)

    run_pipeline("sample_data.xlsx", "Verified_Output.csv")

Reading sample_data.xlsx...
Processing Row 1/1...
Done! Saved to Verified_Output.csv
