In [None]:
## Update this in your financial prompt

# Prompt template for the financial / program-information extraction call.
# The processor renders it with str.format(..., text=description), so `{text}`
# is the only live placeholder. The braces of the JSON skeleton below are
# doubled ({{ and }}) so that format() emits them literally instead of trying
# to resolve them as a replacement field (which raises KeyError).
# NOTE(review): GOLD_STANDARD_EXAMPLES is spliced in before formatting; if it
# contains literal braces they need the same doubling -- confirm where it is
# defined.
FINANCIAL_PROMPT = """
You are a defence contract financial and program analyst.
YOUR TASK: Extract supplier, program info, and financials.

""" + GOLD_STANDARD_EXAMPLES + """

STRICT RULES:

1. **Program Type (CHOOSE EXACTLY ONE)**:
   - **Training**: Purchase of military training *services*. Note: Purchase of training aircraft or simulators falls under "Procurement".
   - **Procurement**: Acquisition of new products, systems, or production kits.
     IMPORTANT: Includes contract modifications and services performed on test/prototype articles for development support.
   - **MRO/Support**: Maintenance, Repair, and Operations. Select ONLY for sustainment/repair of operational fielded equipment.
   - **RDT&E**: Research, prototyping, testing where output is design validation/knowledge rather than a delivered fielded system.
   - **Upgrade**: Purchase of components/services to modernize existing equipment.
   - **Other Service**: Consulting, IT support, or non-lifecycle general services.
   - **Unknown**: If cannot be determined.

2. **Supplier Name (MOST CRITICAL RULES)**:
   - Extract the entity that has been **AWARDED** the contract / performing the scope of work.
   - Output ONLY the **clean brand name** (taxonomy-friendly):
     ✅ Remove legal suffixes (Inc, LLC, Ltd, Co., Corporation, Corp)
     ✅ Remove locations (city/state/country)
     ✅ Remove internal business units unless extremely important
   - Supplier must be a **single organization name only**. No commas, no addresses, no extra text.
   - If multiple companies appear, pick the one **receiving the award** (the contractor).
   - If unsure, extract the **first supplier/company name** mentioned.

3. **Quantity (Crucial)**:
   - **Hardware/Missiles:** Extract total count. If multiple numbers exist for same system, SUM them.
   - **Services/RDT&E/IT:** Use "Not Applicable".
   - **Unknown:** If hardware procurement exists but no number given.

4. **Value Calculation**:
   - Convert to **MILLIONS**
   - Round to **3 decimals**
   - Do NOT include currency symbols or words
   - Example: $2,493,000,000 -> "2493.000"

5. **Value Certainty**:
   - DEFAULT = "Confirmed"
   - Use "Estimated" ONLY when explicitly stated as potential/approximate/projected without a confirmed award value.

6. **Description Date Found**:
   - Only for MRO contracts: extract completion/end date if present, else empty.

Return JSON ONLY with these exact keys:
{{
  "Supplier Name": "...",
  "Program Type": "...",
  "Quantity": "...",
  "Value Certainty": "...",
  "Value (Million)": "...", 
  "Currency": "USD$",
  "Description Date Found": "..."
}}

Description:
\"\"\"
{text}
\"\"\"
"""


In [None]:
## Replace this entirely in the processor file

import json
import datetime
import time
import pandas as pd
import re
import difflib
import os
from dateutil import parser
from dateutil.relativedelta import relativedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI

# Imports from other files
from config import MODEL_NAME, BASE_URL
from data.taxonomy import (
    TAXONOMY_STR, VALID_DEPENDENCIES, GEOGRAPHY_MAPPING,
    VALID_OPERATORS, SUPPLIER_LIST, PROGRAM_TYPES, DOMESTIC_CONTENT_OPTIONS
)
from src.prompts import (
    GEOGRAPHY_PROMPT, FINANCIAL_PROMPT, DOMESTIC_CONTENT_PROMPT
)

# ==========================================
# 0. INITIALIZE TF-IDF MEMORY (NEW LOGIC)
# ==========================================
# ⚠️ IMPORTANT (Cloud fix suggestion):
# Don't hardcode local Windows path in Streamlit Cloud.
# Keep fallback logic so memory loads when file is present in repo.
# Absolute path of the analyst reference workbook on the original development
# machine. When it does not exist, the loader below falls back to a file named
# "Market Segment.xlsx" resolved against the current working directory.
REFERENCE_FILE_PATH = r"C:\Users\mukeshkr\Desktop\DefenseExtraction\Market Segment.xlsx"

print("Loading Reference Examples for Memory...")
# These three are published together, and only after the whole load succeeded,
# so a failure part-way through can never leave an unfitted vectorizer behind
# for get_similar_example to trip over on every row.
vectorizer = None
example_vectors = None
df_examples = None

try:
    local_fallback = "Market Segment.xlsx"

    # Pick the first location that actually exists (configured path wins).
    if os.path.exists(REFERENCE_FILE_PATH):
        _memory_path = REFERENCE_FILE_PATH
    elif os.path.exists(local_fallback):
        _memory_path = local_fallback
    else:
        _memory_path = None

    if _memory_path is None:
        print(f"Warning: Reference file not found at {REFERENCE_FILE_PATH} or {local_fallback}. Memory disabled.")
    else:
        _loaded_examples = pd.read_excel(_memory_path)
        # Same column check for both locations (the fallback used to skip it).
        if 'Description of Contract' not in _loaded_examples.columns:
            print(f"Warning: Column 'Description of Contract' not found in {_memory_path}")
        else:
            _fitted_vectorizer = TfidfVectorizer(stop_words='english')
            _fitted_vectors = _fitted_vectorizer.fit_transform(
                _loaded_examples['Description of Contract'].astype(str)
            )
            vectorizer = _fitted_vectorizer
            example_vectors = _fitted_vectors
            df_examples = _loaded_examples
            if _memory_path == REFERENCE_FILE_PATH:
                print("Success: Memory loaded with TF-IDF Vectorizer.")
            else:
                print(f"Success: Memory loaded from local fallback '{local_fallback}'.")
except Exception as e:
    # Best-effort: the pipeline still runs without the similarity memory.
    print(f"CRITICAL WARNING: Could not load Reference File. Error: {e}")

# ==========================================
# 1. HELPER FUNCTIONS
# ==========================================

# Supplier names ordered longest-first: the substring scans in this file walk
# this list and stop at the first hit, so longer names are tried before any
# shorter name they might contain.
SORTED_SUPPLIER_LIST = sorted(SUPPLIER_LIST, key=len, reverse=True)

def normalize_supplier_name(name: str) -> str:
    """
    Reduce a raw supplier string to a bare brand name for taxonomy matching.

    Steps, in order: strip a leading "The", drop parenthesised text, remove
    common legal suffixes (Inc, LLC, Ltd, Limited, Corp, Corporation, Co,
    Company, GmbH, S.A., plc) with an optional trailing dot, keep only the
    part before the first comma, and collapse whitespace.

    Args:
        name: Raw supplier text; any falsy value (None, "") yields "".

    Returns:
        The cleaned name, possibly empty if nothing but junk was supplied.
    """
    if not name:
        return ""

    cleaned = str(name).strip()

    # Remove leading "The"
    cleaned = re.sub(r"^the\s+", "", cleaned, flags=re.IGNORECASE)

    # Remove bracket info
    cleaned = re.sub(r"\(.*?\)", "", cleaned)

    # Remove common legal suffixes.
    # "S.A." is handled by its own alternative: a trailing `\b` can never match
    # after the final dot when a space, comma or end-of-string follows, so the
    # dotted form needs look-arounds instead. The look-arounds also keep it
    # from firing inside longer dotted abbreviations (e.g. "U.S.A.", "S.A.S.").
    cleaned = re.sub(
        r"\b(?:inc|llc|ltd|limited|corp|corporation|co|company|gmbh|plc)\b\.?"
        r"|(?<![\w.])s\.a\.?(?![\w.])",
        "",
        cleaned,
        flags=re.IGNORECASE
    )

    # Remove trailing comma section (often location)
    cleaned = cleaned.split(",")[0].strip()

    # Normalize spaces
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    return cleaned


def get_best_taxonomy_match(extracted_name: str) -> str:
    """
    Map an extracted supplier name onto the supplier taxonomy when possible.

    Matching is attempted in this order, all case-insensitively:
      1. exact match of the raw or normalized name,
      2. a taxonomy name appearing as a whole word/phrase inside the
         normalized name (longest taxonomy names are tried first),
      3. fuzzy match with a 0.75 similarity cutoff.
    If nothing matches, the normalized extracted name is returned rather than
    being discarded.

    Args:
        extracted_name: Supplier text as produced by the LLM (may be None).

    Returns:
        A taxonomy supplier name, the cleaned extracted name, or "Unknown"
        when the input is missing, a placeholder, or empty after cleaning.
    """
    if extracted_name is None:
        return "Unknown"

    extracted_name = str(extracted_name).strip()

    if extracted_name == "" or extracted_name.lower() in ["unknown", "not applicable", "multiple", "n/a"]:
        return "Unknown"

    clean_name = normalize_supplier_name(extracted_name)
    if not clean_name:
        # Input consisted only of junk (e.g. just "Inc."); nothing to match.
        return "Unknown"
    clean_lower = clean_name.lower()

    # Case-folded lookup table; the first (longest) spelling wins on collisions.
    # Empty taxonomy entries are skipped -- "" would match everything.
    by_lower = {}
    for supplier in SORTED_SUPPLIER_LIST:
        key = str(supplier).strip().lower()
        if key:
            by_lower.setdefault(key, supplier)

    # 1) Exact match (raw text first, so taxonomy names that themselves carry
    #    a legal suffix still hit before normalization strips it)
    for candidate in (extracted_name.lower(), clean_lower):
        if candidate in by_lower:
            return by_lower[candidate]

    # 2) Whole-word containment. A plain substring test lets short taxonomy
    #    names match inside unrelated words, so require non-word characters
    #    (or string edges) on both sides.
    for key, supplier in by_lower.items():
        if re.search(r"(?<!\w)" + re.escape(key) + r"(?!\w)", clean_lower):
            return supplier

    # 3) Fuzzy match (high cutoff), compared on lowercase so that casing
    #    differences do not count against the similarity ratio.
    fuzzy = difflib.get_close_matches(clean_lower, list(by_lower), n=1, cutoff=0.75)
    if fuzzy:
        return by_lower[fuzzy[0]]

    # No taxonomy match: keep the cleaned extracted name instead of "Unknown".
    return clean_name


def call_llm(prompt_text: str, system_message: str = "You are a helpful assistant. Please respond in JSON format.") -> dict:
    """
    Send one chat-completion request and return the reply parsed as a dict.

    The call is best-effort: any failure (network/API error, empty or invalid
    JSON content) is printed and reported to the caller as an empty dict.

    Args:
        prompt_text: Content of the user message.
        system_message: Content of the system message.

    Returns:
        The parsed JSON object, or {} on any error or when the reply is valid
        JSON but not an object.
    """
    # Fixed pause before every request (simple client-side throttling).
    time.sleep(0.5)
    try:
        client = OpenAI(base_url=BASE_URL)

        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": prompt_text}
            ],
            temperature=0,
            response_format={"type": "json_object"}
        )

        content = response.choices[0].message.content
        parsed = json.loads(content)

    except Exception as e:
        print(f"LLM Call Error: {e}")
        return {}

    # Callers use .get() on the result, so anything other than a JSON object
    # (e.g. a bare list or string) is treated like a failed call.
    if not isinstance(parsed, dict):
        print(f"LLM Call Error: expected a JSON object, got {type(parsed).__name__}")
        return {}

    return parsed


def get_similar_example(new_text):
    """
    Find the single most similar contract in the analyst reference file.

    Similarity is the cosine similarity between the TF-IDF vector of
    ``new_text`` and the pre-computed vectors of the reference descriptions.

    Args:
        new_text: Contract description to look up.

    Returns:
        ``{"text": <reference description>, "classification": {...}}`` for the
        best match when its score is above 0.1; otherwise None. None is also
        returned when the reference memory is not loaded or the lookup fails.
    """
    # All three module-level pieces must be present; a partially initialised
    # memory is treated the same as no memory.
    if vectorizer is None or example_vectors is None or df_examples is None:
        return None

    def _plain(value):
        # numpy scalars expose .item(); unwrap them so the caller can pass the
        # classification straight to json.dumps.
        return value.item() if hasattr(value, "item") else value

    try:
        new_vec = vectorizer.transform([new_text])
        similarities = cosine_similarity(new_vec, example_vectors).flatten()
        best_idx = similarities.argmax()

        # Only use example if it's somewhat similar (score > 0.1)
        if similarities[best_idx] > 0.1:
            row = df_examples.iloc[best_idx]
            return {
                # Coerced to str because the caller slices it ([:300]).
                "text": str(row['Description of Contract']),
                "classification": {
                    "Market Segment": _plain(row['Market Segment']),
                    "System Type (General)": _plain(row['System Type (General)']),
                    "System Type (Specific)": _plain(row['System Type (Specific)']),
                    "System Name (General)": _plain(row['System Name (General)']),
                    "System Name (Specific)": _plain(row['System Name (Specific)']),
                    "System Piloting": _plain(row.get('System Piloting', "Derived from logic"))
                }
            }
    except Exception as e:
        # Best-effort: a failed lookup only disables the reference example.
        print(f"Error finding similar example: {e}")

    return None


def calculate_derived_fields(financial_data: dict, geo_data: dict, description: str, contract_date_str: str) -> dict:
    """
    Build the final financial/program columns from the raw LLM outputs.

    Args:
        financial_data: Parsed result of the financial prompt.
        geo_data: Parsed result of the geography prompt.
        description: Contract description (currently not used here).
        contract_date_str: Signing date text, interpreted day-first. When it
            is missing or cannot be parsed, today's date is used instead.

    Returns:
        Dict with supplier, program type, MRO duration in months, quantity,
        value certainty, value (formatted to 3 decimals, "0.000" when not
        numeric), currency, G2G/B2G flag, signing month and signing year.
    """
    try:
        start_date = pd.to_datetime(contract_date_str, dayfirst=True)
    except Exception:
        start_date = None
    # to_datetime does not raise for missing input (None/NaN/""): it yields a
    # null value on which strftime fails, so treat that like a parse error.
    if not isinstance(start_date, datetime.datetime) or pd.isna(start_date):
        start_date = datetime.datetime.today()

    signing_month = start_date.strftime("%B")
    signing_year = str(start_date.year)

    val_llm_output = financial_data.get("Value (Million)", "0.000")
    try:
        clean_val = str(val_llm_output).replace(",", "").replace("$", "").replace("M", "").strip()
        val_float = float(clean_val)
        val_formatted = "{:.3f}".format(val_float)
    except (ValueError, TypeError):
        val_formatted = "0.000"

    cust_country = geo_data.get("Customer Country", "Unknown")
    supp_country = geo_data.get("Supplier Country", "Unknown")
    deal_type = "B2G" if (cust_country == "USA" and supp_country == "USA") else "G2G"

    program_type = financial_data.get("Program Type", "Other Service")
    mro_duration = "Not Applicable"

    if program_type == "MRO/Support":
        raw_desc_date = financial_data.get("Description Date Found", "")
        # The LLM may return null or a number here; work on text only.
        desc_date_str = "" if raw_desc_date is None else str(raw_desc_date).strip()
        if desc_date_str:
            # Explicit durations ("36 months", "5 years") are checked first:
            # handed to the fuzzy date parser, the bare number may be read as
            # a day or a year and produce a bogus end date.
            months_match = re.search(r'(\d+)\s*months?', desc_date_str, re.IGNORECASE)
            years_match = re.search(r'(\d+)\s*years?', desc_date_str, re.IGNORECASE)
            if months_match:
                mro_duration = months_match.group(1)
            elif years_match:
                mro_duration = str(int(years_match.group(1)) * 12)
            else:
                try:
                    end_date = parser.parse(desc_date_str, fuzzy=True)
                    diff = relativedelta(end_date, start_date)
                    total_months = diff.years * 12 + diff.months
                    mro_duration = str(max(0, int(total_months)))
                except Exception:
                    # Unparseable date text: keep the "Not Applicable" default.
                    pass

    qty = financial_data.get("Quantity", "Not Applicable")
    # Quantity is only reported for procurement contracts.
    if program_type != "Procurement":
        qty = "Not Applicable"

    return {
        "Supplier Name": financial_data.get("Supplier Name", "Unknown"),
        "Program Type": program_type,
        "Expected MRO Contract Duration (Months)": mro_duration,
        "Quantity": qty,
        "Value Certainty": financial_data.get("Value Certainty", "Confirmed"),
        "Value (Million)": val_formatted,
        "Currency": "USD$",
        "Value (USD$ Million)": val_formatted,
        "G2G/B2G": deal_type,
        "Signing Month": signing_month,
        "Signing Year": signing_year
    }

# ==========================================
# 2. MAIN PROCESSOR (UPDATED LOGIC)
# ==========================================

def _as_text(value, default: str = "") -> str:
    """Return ``value`` as stripped text, or ``default`` when it is None/blank."""
    if value is None:
        return default
    text = str(value).strip()
    return text if text else default


def classify_record_with_memory(description: str, contract_date_str: str) -> dict:
    """
    Process a single contract row end to end.

    Runs four LLM calls (classification with an optional similar reference
    example, geography, domestic content, financials), post-processes the
    supplier name against the taxonomy, computes the derived fields and merges
    everything into one flat dict.

    Args:
        description: Contract description text.
        contract_date_str: Contract signing date as text.

    Returns:
        Merged dict of classification, geography, domestic-content and
        derived financial fields. Later groups override earlier ones on key
        collisions.
    """

    # --- A. CLASSIFICATION (Market Segment, Systems, Piloting) ---

    similar_case = get_similar_example(description)

    system_instruction = f"""
    You are a Defense Contract Analyst.
    Your goal is to extract technical data points from the "Input Text".

    REFERENCE TAXONOMY:
    {TAXONOMY_STR}
    """

    user_message = f"Input Text: {description}\n\n"

    if similar_case:
        # Reference values come from a spreadsheet: coerce the text before
        # slicing and let json.dumps stringify anything it cannot encode.
        past_input = str(similar_case['text'])[:300]
        past_output = json.dumps(similar_case['classification'], default=str)
        user_message += f"""
        IMPORTANT REFERENCE - Here is a similar contract classified by a human analyst.
        Use this as a guide for your logic:

        [Past Input]: {past_input}...
        [Past Correct Output]: {past_output}

        Now, apply the same logic to the current Input Text.
        """

    user_message += """
    --------------------------------------------------------
    REQUIREMENTS:
    1. Classify 'Market Segment', 'System Type (General)', 'System Type (Specific)' using the Taxonomy.
    2. Extract 'System Name (Specific)' (e.g., MC-130J) and 'System Name (General)' (e.g., C-130).
    3. Determine 'System Piloting' (Crewed, Uncrewed, or Not Applicable).
       - Software/Services/Ammo/Infra = "Not Applicable".
       - Manned Vehicles = "Crewed".
       - Drones/Satellites = "Uncrewed".

    Return JSON only with these exact keys:
    {
        "Market Segment": "...",
        "System Type (General)": "...",
        "System Type (Specific)": "...",
        "System Name (General)": "...",
        "System Name (Specific)": "...",
        "System Piloting": "..."
    }
    """

    class_result = call_llm(user_message, system_instruction)

    # --- B. GEOGRAPHY, DOMESTIC CONTENT, FINANCIALS ---

    # 1. GEOGRAPHY
    geo_json_str = json.dumps(GEOGRAPHY_MAPPING)
    geo_prompt = GEOGRAPHY_PROMPT.format(
        operators=VALID_OPERATORS,
        geo_mapping=geo_json_str,
        text=description
    )
    geo_result = call_llm(geo_prompt)

    # 2. DOMESTIC CONTENT
    # The model may answer with null or a non-string; normalise to text so the
    # .lower() comparison below cannot raise.
    cust_c = _as_text(geo_result.get("Customer Country"), "Unknown")
    supp_c = _as_text(geo_result.get("Supplier Country"), "Unknown")

    dom_prompt = DOMESTIC_CONTENT_PROMPT.format(
        supplier_country=supp_c,
        customer_country=cust_c,
        options=DOMESTIC_CONTENT_OPTIONS,
        text=description
    )
    dom_result_raw = call_llm(dom_prompt)
    dom_val = dom_result_raw.get("Domestic Content", "Imported")

    # Same known country on both sides overrides whatever the model said.
    if cust_c.lower() == supp_c.lower() and cust_c.lower() != "unknown":
        dom_val = "Indigenous"

    if dom_val not in DOMESTIC_CONTENT_OPTIONS:
        dom_val = "Imported"
    dom_result = {"Domestic Content": dom_val}

    # 3. FINANCIALS
    fin_prompt = FINANCIAL_PROMPT.format(
        program_types=PROGRAM_TYPES,
        supplier_list=", ".join(SUPPLIER_LIST),
        text=description
    )
    fin_result_raw = call_llm(fin_prompt)

    # Supplier fallback if the LLM returned nothing usable: look for a known
    # taxonomy supplier in the description itself.
    llm_supplier = _as_text(fin_result_raw.get("Supplier Name"), "")
    if llm_supplier.lower() in ["", "unknown", "not applicable", "n/a", "multiple"]:
        desc_lower = str(description).lower()
        for supplier in SORTED_SUPPLIER_LIST:
            needle = str(supplier).strip().lower()
            # Whole-word search: a raw substring test would let short supplier
            # names match inside ordinary words of the description.
            if needle and re.search(r"(?<!\w)" + re.escape(needle) + r"(?!\w)", desc_lower):
                fin_result_raw["Supplier Name"] = supplier
                break

    # Match supplier to taxonomy (keeps the cleaned name when nothing matches)
    raw_llm_supplier = fin_result_raw.get("Supplier Name", "Unknown")
    matched_taxonomy_name = get_best_taxonomy_match(raw_llm_supplier)
    fin_result_raw["Supplier Name"] = matched_taxonomy_name

    # 4. DERIVED FIELDS calculation
    derived_result = calculate_derived_fields(fin_result_raw, geo_result, description, contract_date_str)

    # --- C. MERGE ALL RESULTS ---
    final_output = {
        **class_result,
        **geo_result,
        **dom_result,
        **derived_result
    }

    return final_output
