In [None]:
!pip install openai langchain langchain-community langchain-core PyPDF2 accelerate bitsandbytes llama_index huggingface_hub chromadb groq anthropic python-docx pypandoc markitdown pytesseract pdf2image python-docx pymupdf pillow pycryptodome==3.15.0



In [None]:
# Mount Google Drive under /GD; the "/GD/My Drive/..." paths used by the
# cells below are resolved through this mount point.
from google.colab import drive
drive.mount('/GD', force_remount=True)  # NOTE(review): force_remount presumably refreshes an existing mount — confirm in Colab docs

Mounted at /GD


# With Pandoc

In [None]:
import os
import re
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed

from PyPDF2 import PdfReader
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from google.colab import userdata

import openai
import pypandoc

# ============================ CONFIG ============================

# The OpenAI key is read from the Colab `userdata` store under the name
# 'OpenaAIAPI'; the helper functions below pass it to ChatOpenAI explicitly.
openai.api_key = userdata.get('OpenaAIAPI')  # make sure this is set
llm_model = "gpt-4.1"

# Colab paths
pdf_path = "/GD/My Drive/JITC/military_standards_pdf/MIL-STD-188-203-1A_1995.pdf"
output_docx_path = "/GD/My Drive/JITC/military_standards_pdf/MIL-STD-188-Auto-TestPlan.docx"
# Swap only the final extension. str.replace(".docx", ".md") would also rewrite
# ".docx" inside directory names and, for a path without that suffix, would
# make the Markdown path identical to the DOCX path.
output_markdown_path = os.path.splitext(output_docx_path)[0] + ".md"  # also save the master .md

# ============================ HELPERS ============================

def ensure_pandoc():
    """
    Make sure pypandoc can locate a pandoc binary.

    pypandoc.get_pandoc_path() raises OSError when no binary is found; in that
    case a message is printed and pypandoc.download_pandoc() is called.
    In Colab, installing pandoc with apt-get beforehand is the preferred route.
    """
    try:
        pypandoc.get_pandoc_path()
    except OSError:
        pandoc_missing = True
    else:
        pandoc_missing = False

    if pandoc_missing:
        print("Pandoc not found by pypandoc. Attempting to download via pypandoc...")
        pypandoc.download_pandoc()

def extract_sections_from_pdf(pdf_path, start_page=12, pages_per_section=10):
    """
    Split a PDF into fixed-size page groups and extract the text of each group.

    Args:
        pdf_path: Path of the PDF handed to PdfReader.
        start_page: 1-based page number of the first page to include.
        pages_per_section: Number of pages grouped into one section.

    Returns:
      sections: dict[str, str] -> { "Section 1: Pages x-y": "<text...>", ... }
      section_names: list[str] -> the section titles in creation order
    """
    pdf = PdfReader(pdf_path)
    total_pages = len(pdf.pages)
    sections = {}
    group_starts = range(start_page - 1, total_pages, pages_per_section)
    for section_idx, first in enumerate(group_starts, start=1):
        # `stop` is exclusive as a 0-based index, which makes it equal to the
        # 1-based number of the last page in the group.
        stop = min(first + pages_per_section, total_pages)
        # extract_text() may return None; treat that as an empty page.
        page_texts = [pdf.pages[k].extract_text() or "" for k in range(first, stop)]
        title = f"Section {section_idx}: Pages {first + 1}-{stop}"
        sections[title] = "\n".join(page_texts).strip()
    return sections, list(sections)

def extract_rules_for_section(section_name, section_text):
    """
    Ask the LLM to extract testable rules from one section of the standard.

    The prompt instructs the model to reply in Markdown laid out as:
    ## [Section Title]
    **Dependencies:**
    **Conflicts:**
    **Test Rules:**

    Args:
        section_name: Label of the section; embedded in the prompt and returned
            unchanged so callers running this concurrently can match results.
        section_text: Text of the section; embedded verbatim in the prompt.

    Returns:
        (section_name, markdown): the label plus the ``content`` of the reply.
    """
    llm = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
    prompt = (
        "You are a MIL-STD-188 compliance and test planning expert.\n"
        "Analyze the following section of a military standard and extract EVERY possible testable rule, specification, constraint, or requirement. "
        "Rules MUST be extremely detailed, explicit, and step-by-step, and should include measurable criteria, acceptable ranges, and referenced figures or tables if mentioned. "
        "For ambiguous or implicit requirements, describe a specific test strategy.\n"
        "Generate a short, content-based TITLE for this section (do not use page numbers).\n"
        "Organize your output as follows, using markdown headings and bolded text:\n\n"
        "## [Section Title]\n"
        "**Dependencies:**\n- List detailed dependencies as explicit tests, if any.\n\n"
        "**Conflicts:**\n- List detected or possible conflicts and provide recommendations or mitigation steps.\n\n"
        "**Test Rules:**\n1. (Very detailed, step-by-step numbered test rules)\n"
        f"\nSection Name: {section_name}\n\nSection Text:\n{section_text}\n\n"
        "---\n"
        "If you find truly nothing testable, reply: 'No testable rules in this section.'"
    )
    # Calling the model object directly (``llm([...])``) is deprecated in
    # LangChain and emits a deprecation warning; ``invoke`` is the supported
    # entry point and returns the same message object.
    result = llm.invoke([HumanMessage(content=prompt)])
    return section_name, result.content

def synthesize_pairwise_test_plan(pair_name, rules_text_1, rules_text_2):
    """
    Ask the LLM to merge the rules of two adjacent sections into one Markdown block.

    Args:
        pair_name: Key identifying the pair; returned unchanged so callers
            running this concurrently can match results to pairs.
        rules_text_1: Markdown rules of the first section.
        rules_text_2: Markdown rules of the second section.

    Returns:
        (pair_name, markdown): the key plus the ``content`` of the reply.
    """
    llm = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
    prompt = (
        "You are a senior QA documentation engineer.\n"
        "Given the DETAILED test rules and extracted section titles for two consecutive MIL-STD-188 sections, synthesize a single, logically organized, highly detailed test plan section.\n"
        "Combine rules, merge similar steps, cross-reference overlapping content, and call out dependencies or conflicts. "
        "Use a single, **content-based TITLE** for this combined section (not using page numbers). "
        "Keep bold markdown headings for 'Dependencies', 'Conflicts', and 'Test Rules'.\n"
        "Test rules must be extremely explicit, step-by-step, and cover ALL possible technical details and verification steps.\n"
        "Format the output using markdown.\n\n"
        "=== SECTION 1 ===\n" + rules_text_1 + "\n\n=== SECTION 2 ===\n" + rules_text_2 +
        "\n\n=== END ===\n\nOutput ONLY the combined test plan in the described format."
    )
    # ``llm([...])`` is the deprecated call form in LangChain; ``invoke`` is the
    # supported equivalent and returns the same message object.
    result = llm.invoke([HumanMessage(content=prompt)])
    return pair_name, result.content

def _sanitize_markdown(md: str) -> str:
    """
    Normalize some common LLM artifacts so Pandoc produces perfect lists/headings:
    - Convert emoji bullets to '-' bullets.
    - Ensure numbered lists like '1)' -> '1.' for consistency.
    - Trim excess whitespace.
    """
    # Replace common emoji bullets with hyphen bullets
    md = md.replace("🔹 ", "- ")
    md = md.replace("• ", "- ")
    md = md.replace("– ", "- ")

    # Normalize numbered lists that use ')'
    md = re.sub(r'^(\s*)(\d+)\)\s+', r'\1\2. ', md, flags=re.MULTILINE)

    # Avoid weird bold markers separated by spaces: ** bold ** -> **bold**
    md = re.sub(r'\*\*\s+(.*?)\s+\*\*', r'**\1**', md)

    return md.strip() + "\n"

def build_master_markdown(section_names, section_rule_reports, pairwise_test_plans) -> str:
    """
    Assemble the master Markdown document.

    Layout: a title and a generated-by note, then, for every name in
    `section_names` (in that order, independent of the order in which results
    were collected), the section's sanitized rules followed by the sanitized
    pairwise plan stored under the same key, if any. A `\\newpage` line is
    emitted after the front matter and after every section.

    Args:
        section_names: Section keys in the order they should appear.
        section_rule_reports: Mapping of section key -> rules Markdown.
        pairwise_test_plans: Mapping of section key -> combined-plan Markdown.

    Returns:
        The complete Markdown text.
    """
    page_break = "\n\\newpage\n"  # Pandoc page break
    chunks = [
        "# MIL-STD-188 Automated Compliance Test Plan\n",
        "> Generated via automated extraction and synthesis pipeline.\n",
        page_break,
    ]

    for name in section_names:
        rules = section_rule_reports.get(name, "").strip()
        if rules:
            chunks.extend((_sanitize_markdown(rules), "\n"))

        # Membership, not truthiness: a stored-but-empty plan is still emitted.
        if name in pairwise_test_plans:
            chunks.extend((_sanitize_markdown(pairwise_test_plans[name].strip()), "\n"))

        chunks.append(page_break)

    return "\n".join(chunks)

def write_docx_with_pandoc(markdown_text: str, output_docx_path: str, reference_docx: str = None, also_save_md: str = None):
    """
    Convert Markdown text to a .docx file with Pandoc (through pypandoc).

    The text is written to a temporary .md file and converted with a table of
    contents and numbered sections. Several input-format strings are tried in
    turn; the first one that converts without a RuntimeError wins.

    Args:
        markdown_text: The Markdown document to convert.
        output_docx_path: Destination path of the .docx file.
        reference_docx: Optional path passed to Pandoc as --reference-doc.
        also_save_md: Optional path; when given, the Markdown is saved there too.

    Raises:
        RuntimeError: The error of the last attempt when every format fails.
    """
    import subprocess
    import tempfile

    import pypandoc

    ensure_pandoc()

    # os.path.dirname() is '' for a bare filename and os.makedirs('') raises,
    # so only create parent directories that are actually named.
    for target in (also_save_md, output_docx_path):
        parent = os.path.dirname(target) if target else ""
        if parent:
            os.makedirs(parent, exist_ok=True)

    if also_save_md:
        with open(also_save_md, "w", encoding="utf-8") as f:
            f.write(markdown_text)

    # Helpful for debugging your environment; purely informational, so any
    # failure here is deliberately ignored.
    try:
        ver = subprocess.check_output(["pandoc", "-v"]).decode().splitlines()[0]
        print(f"[pandoc] {ver}")
    except Exception:
        pass

    # Write temp md (delete=False so Pandoc can open it by name after we close it)
    with tempfile.NamedTemporaryFile(suffix=".md", delete=False, mode="w", encoding="utf-8") as tmp:
        tmp.write(markdown_text)
        tmp_md_path = tmp.name

    extra_args = ["--toc", "--toc-depth=3", "--number-sections"]
    if reference_docx:
        extra_args += ["--reference-doc", reference_docx]

    # Try more compatible formats first
    candidate_from_formats = [
        # Very compatible:
        "gfm+pipe_tables+autolink_bare_uris",
        # Plain markdown without the problematic extension:
        "markdown+pipe_tables+autolink_bare_uris",
        # Fenced code and basic extras:
        "markdown+fenced_code_blocks+autolink_bare_uris+pipe_tables",
    ]

    try:
        last_err = None
        for frm in candidate_from_formats:
            try:
                pypandoc.convert_file(
                    tmp_md_path,
                    to="docx",
                    outputfile=output_docx_path,
                    extra_args=extra_args,
                    format=frm
                )
                print(f"Test plan saved to {output_docx_path} (from={frm})")
                return
            except RuntimeError as e:
                last_err = e
                print(f"[pandoc retry] failed with from={frm} -> {e}")

        # If all fail, surface the last error
        raise last_err
    finally:
        # The temp file was created with delete=False, so it must be removed
        # here on both the success and the failure path.
        try:
            os.remove(tmp_md_path)
        except OSError:
            pass  # best-effort cleanup; never mask the conversion result


# ============================ MAIN ============================

if __name__ == "__main__":
    # Step 1: split the PDF into 10-page sections, starting at page 12.
    sections, section_names = extract_sections_from_pdf(pdf_path, start_page=12, pages_per_section=10)

    # Step 2: one LLM rule-extraction call per section, run concurrently.
    # Results are stored as they finish; output order is restored later by
    # iterating `section_names`.
    section_rule_results = {}
    with ThreadPoolExecutor(max_workers=8) as executor:
        rule_jobs = [
            executor.submit(extract_rules_for_section, title, body)
            for title, body in sections.items()
        ]
        for job in as_completed(rule_jobs):
            section_name, rules_md = job.result()
            section_rule_results[section_name] = rules_md
            print(f"\n--- {section_name} ---\n{rules_md}\n{'='*60}")

    # Step 3: merge every pair of neighbouring sections, keyed by the first
    # section of the pair.
    pairwise_test_plans = {}
    with ThreadPoolExecutor(max_workers=4) as executor:
        pair_jobs = [
            executor.submit(
                synthesize_pairwise_test_plan,
                first,
                section_rule_results.get(first, ""),
                section_rule_results.get(second, ""),
            )
            for first, second in zip(section_names, section_names[1:])
        ]
        for job in as_completed(pair_jobs):
            pair_key, combined_md = job.result()
            pairwise_test_plans[pair_key] = combined_md
            print(f"\n--- Pair: {pair_key} ---\n{combined_md}\n{'='*60}")

    # Step 4: serialize everything in section order and hand it to Pandoc.
    master_md = build_master_markdown(section_names, section_rule_results, pairwise_test_plans)
    write_docx_with_pandoc(
        master_md,
        output_docx_path=output_docx_path,
        reference_docx=None,            # Optional: supply a custom Word styles template
        also_save_md=output_markdown_path
    )


  llm = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
  result = llm([HumanMessage(content=prompt)])



--- Section 6: Pages 62-71 ---
## Acronym and TADIL A System Operation, Data Transmission, and DTS-to-TDS Interface Specifications

**Dependencies:**
1. **MIL-STD-1397 Compliance:**  
   - The TDS computer interface and electrical characteristics must be tested for full compliance with MIL-STD-1397, specifically for Type A, Category I (NTDS SLOW - Computer to Peripheral) interface.
   - Obtain and reference MIL-STD-1397 for all signal definitions, timings, and electrical characteristics.

2. **Referenced Tables and Figures:**  
   - TABLE VI: Error status summary bit definitions (see also 5.2.4.1 and TABLE V for Ko-K16 definitions).
   - TABLE VII: Bit assignments for data and interrupt code words.
   - TABLE VIII: External interrupt codes.
   - FIGURE 9: Representative TADIL A net (for net setup and operation validation).
   - FIGURE 10: Representative TADIL A system configuration (for equipment interconnections).
   - FIGURE 11: DTS-to-TDS computer interface (for signal and line val

# Pandoc + Test Card

In [None]:
import os
import re
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed

from PyPDF2 import PdfReader
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from google.colab import userdata

import openai
import pypandoc

# ============================ CONFIG ============================

# The OpenAI key is read from the Colab `userdata` store under the name
# 'OpenaAIAPI'; the helper functions below pass it to ChatOpenAI explicitly.
openai.api_key = userdata.get('OpenaAIAPI')  # make sure this is set
llm_model = "gpt-4.1"

# Colab paths
pdf_path = "/GD/My Drive/JITC/military_standards_pdf/MIL-STD-188-203-1A_1995.pdf"
output_docx_path = "/GD/My Drive/JITC/military_standards_pdf/MIL-STD-188-Auto-TestPlan.docx"
# Swap only the final extension. str.replace(".docx", ".md") would also rewrite
# ".docx" inside directory names and, for a path without that suffix, would
# make the Markdown path identical to the DOCX path.
output_markdown_path = os.path.splitext(output_docx_path)[0] + ".md"  # also save the master .md

# ============================ HELPERS ============================

def ensure_pandoc():
    """
    Make sure pypandoc can locate a pandoc binary.

    pypandoc.get_pandoc_path() raises OSError when no binary is found; in that
    case a message is printed and pypandoc.download_pandoc() is called.
    In Colab, installing pandoc with apt-get beforehand is the preferred route.
    """
    try:
        pypandoc.get_pandoc_path()
    except OSError:
        pandoc_missing = True
    else:
        pandoc_missing = False

    if pandoc_missing:
        print("Pandoc not found by pypandoc. Attempting to download via pypandoc...")
        pypandoc.download_pandoc()

def extract_sections_from_pdf(pdf_path, start_page=12, pages_per_section=10):
    """
    Split a PDF into fixed-size page groups and extract the text of each group.

    Args:
        pdf_path: Path of the PDF handed to PdfReader.
        start_page: 1-based page number of the first page to include.
        pages_per_section: Number of pages grouped into one section.

    Returns:
      sections: dict[str, str] -> { "Section 1: Pages x-y": "<text...>", ... }
      section_names: list[str] -> the section titles in creation order
    """
    pdf = PdfReader(pdf_path)
    total_pages = len(pdf.pages)
    sections = {}
    group_starts = range(start_page - 1, total_pages, pages_per_section)
    for section_idx, first in enumerate(group_starts, start=1):
        # `stop` is exclusive as a 0-based index, which makes it equal to the
        # 1-based number of the last page in the group.
        stop = min(first + pages_per_section, total_pages)
        # extract_text() may return None; treat that as an empty page.
        page_texts = [pdf.pages[k].extract_text() or "" for k in range(first, stop)]
        title = f"Section {section_idx}: Pages {first + 1}-{stop}"
        sections[title] = "\n".join(page_texts).strip()
    return sections, list(sections)

def extract_rules_for_section(section_name, section_text):
    """
    Ask the LLM to extract testable rules from one section of the standard.

    The prompt instructs the model to reply in Markdown laid out as:
    ## [Section Title]
    **Dependencies:**
    **Conflicts:**
    **Test Rules:**

    Args:
        section_name: Label of the section; embedded in the prompt and returned
            unchanged so callers running this concurrently can match results.
        section_text: Text of the section; embedded verbatim in the prompt.

    Returns:
        (section_name, markdown): the label plus the ``content`` of the reply.
    """
    llm = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
    prompt = (
        "You are a MIL-STD-188 compliance and test planning expert.\n"
        "Analyze the following section of a military standard and extract EVERY possible testable rule, specification, constraint, or requirement. "
        "Rules MUST be extremely detailed, explicit, and step-by-step, and should include measurable criteria, acceptable ranges, and referenced figures or tables if mentioned. "
        "For ambiguous or implicit requirements, describe a specific test strategy.\n"
        "Generate a short, content-based TITLE for this section (do not use page numbers).\n"
        "Organize your output as follows, using markdown headings and bolded text:\n\n"
        "## [Section Title]\n"
        "**Dependencies:**\n- List detailed dependencies as explicit tests, if any.\n\n"
        "**Conflicts:**\n- List detected or possible conflicts and provide recommendations or mitigation steps.\n\n"
        "**Test Rules:**\n1. (Very detailed, step-by-step numbered test rules)\n"
        f"\nSection Name: {section_name}\n\nSection Text:\n{section_text}\n\n"
        "---\n"
        "If you find truly nothing testable, reply: 'No testable rules in this section.'"
    )
    # Calling the model object directly (``llm([...])``) is deprecated in
    # LangChain and emits a deprecation warning; ``invoke`` is the supported
    # entry point and returns the same message object.
    result = llm.invoke([HumanMessage(content=prompt)])
    return section_name, result.content

def synthesize_pairwise_test_plan(pair_name, rules_text_1, rules_text_2):
    """
    Ask the LLM to merge the rules of two adjacent sections into one Markdown block.

    Args:
        pair_name: Key identifying the pair; returned unchanged so callers
            running this concurrently can match results to pairs.
        rules_text_1: Markdown rules of the first section.
        rules_text_2: Markdown rules of the second section.

    Returns:
        (pair_name, markdown): the key plus the ``content`` of the reply.
    """
    llm = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
    prompt = (
        "You are a senior QA documentation engineer.\n"
        "Given the DETAILED test rules and extracted section titles for two consecutive MIL-STD-188 sections, synthesize a single, logically organized, highly detailed test plan section.\n"
        "Combine rules, merge similar steps, cross-reference overlapping content, and call out dependencies or conflicts. "
        "Use a single, **content-based TITLE** for this combined section (not using page numbers). "
        "Keep bold markdown headings for 'Dependencies', 'Conflicts', and 'Test Rules'.\n"
        "Test rules must be extremely explicit, step-by-step, and cover ALL possible technical details and verification steps.\n"
        "Format the output using markdown.\n\n"
        "=== SECTION 1 ===\n" + rules_text_1 + "\n\n=== SECTION 2 ===\n" + rules_text_2 +
        "\n\n=== END ===\n\nOutput ONLY the combined test plan in the described format."
    )
    # ``llm([...])`` is the deprecated call form in LangChain; ``invoke`` is the
    # supported equivalent and returns the same message object.
    result = llm.invoke([HumanMessage(content=prompt)])
    return pair_name, result.content

def build_test_card_for_section(section_name, rules_markdown):
    """
    Ask the LLM to transform the 'Test Rules' into a single Markdown pipe-table
    'Test Card' with blank checkboxes. Output ONLY the table (no prose).
    Columns:
      - Test ID
      - Test Title
      - Procedures (compact numbered steps; use <br> for line breaks)
      - Executed
      - Pass
      - Fail
      - Notes

    Args:
        section_name: Label of the section; embedded in the prompt.
        rules_markdown: The section's rules as Markdown; embedded in the prompt.

    Returns:
        The table as Markdown text. When the reply contains no '|' at all, a
        one-row placeholder table is returned instead.
    """
    llm = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
    prompt = (
        "You are a QA test documentation assistant.\n"
        "From the following section rules (Markdown), generate a single Markdown pipe table named 'Test Card' "
        "that lists one row per test. Do NOT include any text before or after the table.\n"
        "Requirements:\n"
        "- Columns: Test ID | Test Title | Procedures | Executed | Pass | Fail | Notes\n"
        "- 'Procedures' should be concise numbered steps separated by <br> (e.g., '1) ...<br>2) ...').\n"
        "- Leave 'Executed', 'Pass', and 'Fail' empty with a checkbox symbol (use '☐'). Do NOT tick anything.\n"
        "- Derive Tests from the 'Test Rules' content. Use short, content-based titles.\n"
        "- Output ONLY the table in GitHub-style pipe-table format.\n\n"
        f"=== SECTION NAME ===\n{section_name}\n\n"
        f"=== SECTION RULES (MARKDOWN) ===\n{rules_markdown}\n\n"
        "=== END ==="
    )
    # ``llm([...])`` is the deprecated call form in LangChain; ``invoke`` is the
    # supported equivalent and returns the same message object.
    result = llm.invoke([HumanMessage(content=prompt)])
    table_md = result.content.strip()

    # If the whole reply is wrapped in a ``` fence, unwrap it; a fenced table
    # would otherwise reach Pandoc as a code block instead of a table.
    fenced = re.fullmatch(r"```[^\n]*\n(.*?)\n?```", table_md, flags=re.DOTALL)
    if fenced:
        table_md = fenced.group(1).strip()

    # Safety: ensure it's a pipe table; if not, wrap with a minimal header
    if '|' not in table_md:
        header = "| Test ID | Test Title | Procedures | Executed | Pass | Fail | Notes |\n"
        sep    = "|---|---|---|---|---|---|---|\n"
        table_md = header + sep + "| 1 | (LLM failed to tabulate) | See rules above | ☐ | ☐ | ☐ | |\n"
    return table_md

def _sanitize_markdown(md: str) -> str:
    """
    Normalize some common LLM artifacts so Pandoc produces perfect lists/headings:
    - Convert emoji bullets to '-' bullets.
    - Ensure numbered lists like '1)' -> '1.' for consistency.
    - Trim excess whitespace.
    """
    # Replace common emoji bullets with hyphen bullets
    md = md.replace("🔹 ", "- ")
    md = md.replace("• ", "- ")
    md = md.replace("– ", "- ")

    # Normalize numbered lists that use ')'
    md = re.sub(r'^(\s*)(\d+)\)\s+', r'\1\2. ', md, flags=re.MULTILINE)

    # Avoid weird bold markers separated by spaces: ** bold ** -> **bold**
    md = re.sub(r'\*\*\s+(.*?)\s+\*\*', r'**\1**', md)

    return md.strip() + "\n"

def build_master_markdown(section_names, section_rule_reports, section_test_cards, pairwise_test_plans) -> str:
    """
    Assemble the master Markdown document.

    Layout: a title and a generated-by note, then, for every name in
    `section_names` (in that order, independent of the order in which results
    were collected):
      - the section's sanitized rules,
      - a '### Test Card' heading followed by the section's table, unsanitized,
      - the sanitized pairwise plan stored under the same key, if any.
    A `\\newpage` line is emitted after the front matter and after every section.

    Args:
        section_names: Section keys in the order they should appear.
        section_rule_reports: Mapping of section key -> rules Markdown.
        section_test_cards: Mapping of section key -> test-card table Markdown.
        pairwise_test_plans: Mapping of section key -> combined-plan Markdown.

    Returns:
        The complete Markdown text.
    """
    page_break = "\n\\newpage\n"  # Pandoc page break
    chunks = [
        "# MIL-STD-188 Automated Compliance Test Plan\n",
        "> Generated via automated extraction and synthesis pipeline.\n",
        page_break,
    ]

    for name in section_names:
        rules = section_rule_reports.get(name, "").strip()
        if rules:
            chunks.extend((_sanitize_markdown(rules), "\n"))

        card = section_test_cards.get(name, "").strip()
        if card:
            # The table goes in as-is: sanitizing could mangle its pipes.
            chunks.extend(("### Test Card\n\n", card, "\n"))

        # Membership, not truthiness: a stored-but-empty plan is still emitted.
        if name in pairwise_test_plans:
            chunks.extend((_sanitize_markdown(pairwise_test_plans[name].strip()), "\n"))

        chunks.append(page_break)

    return "\n".join(chunks)

def write_docx_with_pandoc(markdown_text: str, output_docx_path: str, reference_docx: str = None, also_save_md: str = None):
    """
    Convert Markdown text to a .docx file with Pandoc (through pypandoc).

    The text is written to a temporary .md file and converted with a table of
    contents and numbered sections. Several input-format strings are tried in
    turn; the first one that converts without a RuntimeError wins.

    Args:
        markdown_text: The Markdown document to convert.
        output_docx_path: Destination path of the .docx file.
        reference_docx: Optional path passed to Pandoc as --reference-doc.
        also_save_md: Optional path; when given, the Markdown is saved there too.

    Raises:
        RuntimeError: The error of the last attempt when every format fails.
    """
    import subprocess
    import tempfile

    import pypandoc

    ensure_pandoc()

    # os.path.dirname() is '' for a bare filename and os.makedirs('') raises,
    # so only create parent directories that are actually named.
    for target in (also_save_md, output_docx_path):
        parent = os.path.dirname(target) if target else ""
        if parent:
            os.makedirs(parent, exist_ok=True)

    if also_save_md:
        with open(also_save_md, "w", encoding="utf-8") as f:
            f.write(markdown_text)

    # Helpful for debugging your environment; purely informational, so any
    # failure here is deliberately ignored.
    try:
        ver = subprocess.check_output(["pandoc", "-v"]).decode().splitlines()[0]
        print(f"[pandoc] {ver}")
    except Exception:
        pass

    # Write temp md (delete=False so Pandoc can open it by name after we close it)
    with tempfile.NamedTemporaryFile(suffix=".md", delete=False, mode="w", encoding="utf-8") as tmp:
        tmp.write(markdown_text)
        tmp_md_path = tmp.name

    extra_args = ["--toc", "--toc-depth=3", "--number-sections"]
    if reference_docx:
        extra_args += ["--reference-doc", reference_docx]

    # Try more compatible formats first
    candidate_from_formats = [
        # Very compatible:
        "gfm+pipe_tables+autolink_bare_uris",
        # Plain markdown without the problematic extension:
        "markdown+pipe_tables+autolink_bare_uris",
        # Fenced code and basic extras:
        "markdown+fenced_code_blocks+autolink_bare_uris+pipe_tables",
    ]

    try:
        last_err = None
        for frm in candidate_from_formats:
            try:
                pypandoc.convert_file(
                    tmp_md_path,
                    to="docx",
                    outputfile=output_docx_path,
                    extra_args=extra_args,
                    format=frm
                )
                print(f"Test plan saved to {output_docx_path} (from={frm})")
                return
            except RuntimeError as e:
                last_err = e
                print(f"[pandoc retry] failed with from={frm} -> {e}")

        # If all fail, surface the last error
        raise last_err
    finally:
        # The temp file was created with delete=False, so it must be removed
        # here on both the success and the failure path.
        try:
            os.remove(tmp_md_path)
        except OSError:
            pass  # best-effort cleanup; never mask the conversion result


# ============================ MAIN ============================

if __name__ == "__main__":
    # 1) Extract sections (skip first 11 pages; 10 per section)
    sections, section_names = extract_sections_from_pdf(pdf_path, start_page=12, pages_per_section=10)

    # 2) PARALLEL: Extract detailed rules per section
    section_rule_results = {}
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(extract_rules_for_section, name, text) for name, text in sections.items()]
        for fut in as_completed(futures):
            section_name, rules_md = fut.result()
            section_rule_results[section_name] = rules_md
            print(f"\n--- {section_name} ---\n{rules_md}\n{'='*60}")

    # 2.5) PARALLEL: Build a Test Card table for each section (from its rules).
    # build_test_card_for_section returns only the table, so each future is
    # stored under its section name. Every LLM call is made exactly once;
    # sections with empty rules are skipped.
    section_test_cards = {}
    with ThreadPoolExecutor(max_workers=6) as executor:
        card_futures = {
            name: executor.submit(build_test_card_for_section, name, section_rule_results[name])
            for name in section_names
            if section_rule_results.get(name)
        }
        # Collect in section order so the printed log is deterministic.
        for sec, fut in card_futures.items():
            table_md = fut.result()
            section_test_cards[sec] = table_md
            print(f"\n--- Test Card for {sec} ---\n{table_md}\n{'='*60}")

    # 3) PARALLEL: Create pairwise test plans for adjacent sections
    pairwise_test_plans = {}
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = []
        for s1, s2 in zip(section_names, section_names[1:]):
            rules1 = section_rule_results.get(s1, "")
            rules2 = section_rule_results.get(s2, "")
            futures.append(executor.submit(synthesize_pairwise_test_plan, s1, rules1, rules2))
        for fut in as_completed(futures):
            pair_key, combined_md = fut.result()
            pairwise_test_plans[pair_key] = combined_md
            print(f"\n--- Pair: {pair_key} ---\n{combined_md}\n{'='*60}")

    # 4) Build a single ordered Markdown and convert with Pandoc
    master_md = build_master_markdown(section_names, section_rule_results, section_test_cards, pairwise_test_plans)
    write_docx_with_pandoc(
        master_md,
        output_docx_path=output_docx_path,
        reference_docx=None,            # Optional: supply a custom Word styles template
        also_save_md=output_markdown_path
    )



--- Section 1: Pages 12-21 ---
## General System and Interoperability Requirements for TADIL A Equipment

**Dependencies:**

1. **Referenced Documents Must Be Current and Applicable:**
   - Verify that all referenced standards, handbooks, and publications (listed in Sections 2.1–2.2) are the latest versions as specified in the Department of Defense Index of Specifications and Standards (DODISS) at time of solicitation or as otherwise specified.
   - Confirm that FED-STD-1037, MIL-STD-188-100, MIL-STD-188-114, MIL-STD-188-124, MIL-STD-461, MIL-STD-462, MIL-STD-1397, MIL-HDBK-232, MIL-HDBK-237, MIL-HDBK-241, STANAG 5511, NACSIM 5100, NACSEM 5200, and ITU Radio Regulations are available and referenced during design, development, and testing.
   - Confirm that message formats are verified according to JCS-PUB-10, not this document.

2. **Existing Facility Assessment:**
   - For existing TADIL A facilities, test for compliance only if undergoing major modification or rehabilitation. Otherw

In [None]:
import os
from PyPDF2 import PdfReader
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from google.colab import userdata
from concurrent.futures import ThreadPoolExecutor, as_completed
from docx import Document

import openai
# The OpenAI key is read from the Colab `userdata` store under the name
# 'OpenaAIAPI'; the functions below pass it to ChatOpenAI explicitly.
openai.api_key = userdata.get('OpenaAIAPI')
# Model name handed to ChatOpenAI as `model_name` by the LLM helper functions.
llm_model = "gpt-4.1"

# SECTION EXTRACTION
def extract_sections_from_pdf(pdf_path, start_page=12, pages_per_section=10):
    """
    Split a PDF into fixed-size page groups and extract the text of each group.

    Args:
        pdf_path: Path of the PDF handed to PdfReader.
        start_page: 1-based page number of the first page to include.
        pages_per_section: Number of pages grouped into one section.

    Returns:
        dict[str, str] mapping "Section N: Pages x-y" to the group's text,
        in creation order.
    """
    pdf = PdfReader(pdf_path)
    total_pages = len(pdf.pages)
    sections = {}
    group_starts = range(start_page - 1, total_pages, pages_per_section)
    for section_idx, first in enumerate(group_starts, start=1):
        # `stop` is exclusive as a 0-based index, which makes it equal to the
        # 1-based number of the last page in the group.
        stop = min(first + pages_per_section, total_pages)
        # extract_text() may return None; treat that as an empty page.
        page_texts = [pdf.pages[k].extract_text() or "" for k in range(first, stop)]
        title = f"Section {section_idx}: Pages {first + 1}-{stop}"
        sections[title] = "\n".join(page_texts).strip()
    return sections

# DETAILED RULE EXTRACTION AGENT
def extract_rules_for_section(section_name, section_text):
    """
    Ask the LLM to extract testable rules from one section of the standard.

    The prompt instructs the model to reply in Markdown with a '## [Section
    Title]' heading followed by bold 'Dependencies', 'Conflicts' and
    'Test Rules' parts.

    Args:
        section_name: Label of the section; embedded in the prompt and returned
            unchanged so callers running this concurrently can match results.
        section_text: Text of the section; embedded verbatim in the prompt.

    Returns:
        (section_name, markdown): the label plus the ``content`` of the reply.
    """
    llm = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
    prompt = (
        "You are a MIL-STD-188 compliance and test planning expert.\n"
        "Analyze the following section of a military standard and extract EVERY possible testable rule, specification, constraint, or requirement. "
        "Rules MUST be extremely detailed, explicit, and step-by-step, and should include measurable criteria, acceptable ranges, and referenced figures or tables if mentioned. "
        "For ambiguous or implicit requirements, describe a specific test strategy.\n"
        "Generate a short, content-based TITLE for this section (do not use page numbers).\n"
        "Organize your output as follows, using markdown headings and bolded text:\n\n"
        "## [Section Title]\n"
        "**Dependencies:**\n- List detailed dependencies as explicit tests, if any.\n\n"
        "**Conflicts:**\n- List detected or possible conflicts and provide recommendations or mitigation steps.\n\n"
        "**Test Rules:**\n1. (Very detailed, step-by-step numbered test rules)\n"
        f"\nSection Name: {section_name}\n\nSection Text:\n{section_text}\n\n"
        "---\n"
        "If you find truly nothing testable, reply: 'No testable rules in this section.'"
    )
    # Calling the model object directly (``llm([...])``) is deprecated in
    # LangChain and emits a deprecation warning; ``invoke`` is the supported
    # entry point and returns the same message object.
    result = llm.invoke([HumanMessage(content=prompt)])
    return section_name, result.content

# PAIRWISE MINI-TEST-PLAN AGENT
def synthesize_pairwise_test_plan(pair_name, rules_text_1, rules_text_2):
    """
    Ask the LLM to merge the rules of two adjacent sections into one Markdown block.

    Args:
        pair_name: Key identifying the pair; returned unchanged so callers
            running this concurrently can match results to pairs.
        rules_text_1: Markdown rules of the first section.
        rules_text_2: Markdown rules of the second section.

    Returns:
        (pair_name, markdown): the key plus the ``content`` of the reply.
    """
    llm = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
    prompt = (
        "You are a senior QA documentation engineer.\n"
        "Given the DETAILED test rules and extracted section titles for two consecutive MIL-STD-188 sections, synthesize a single, logically organized, highly detailed test plan section.\n"
        "Combine rules, merge similar steps, cross-reference overlapping content, and call out dependencies or conflicts. "
        "Use a single, **content-based TITLE** for this combined section (not using page numbers). "
        "Keep bold markdown headings for 'Dependencies', 'Conflicts', and 'Test Rules'.\n"
        "Test rules must be extremely explicit, step-by-step, and cover ALL possible technical details and verification steps.\n"
        "Format the output using markdown.\n\n"
        "=== SECTION 1 ===\n" + rules_text_1 + "\n\n=== SECTION 2 ===\n" + rules_text_2 +
        "\n\n=== END ===\n\nOutput ONLY the combined test plan in the described format."
    )
    # ``llm([...])`` is the deprecated call form in LangChain; ``invoke`` is the
    # supported equivalent and returns the same message object.
    result = llm.invoke([HumanMessage(content=prompt)])
    return pair_name, result.content

# MARKDOWN TO DOCX (REMOVING MARKDOWN)
def _add_runs_with_bold(paragraph, text):
    """Append *text* to *paragraph*, turning ``**...**`` spans into bold runs."""
    bold = False
    for piece in text.split("**"):
        if piece:
            run = paragraph.add_run(piece)
            if bold:
                run.bold = True
        bold = not bold


def markdown_to_docx(markdown_text, doc):
    """
    Append a small subset of Markdown to a python-docx document, line by line.

    Handled constructs (each line is stripped first; blank lines and
    horizontal rules are skipped):
      - '#'..'######' headings: '#' and '##' become level 1, '###' level 2, ...
      - a line that is exactly one bold span ('**Title:**') -> level-2 heading
      - '-', '*' or '•' followed by whitespace -> 'List Bullet' paragraph
      - digits followed by '.' or ')' and whitespace -> 'List Number' paragraph
      - any other line -> paragraph; '**...**' spans become bold runs
    Inline bold is also rendered inside list items.

    Args:
        markdown_text: Markdown source to convert.
        doc: Document-like object exposing add_heading() and add_paragraph().
    """
    import re  # function-scope import: this cell does not import re itself

    heading_re = re.compile(r'^(#{1,6})\s+(.*)$')
    # The marker must be followed by whitespace; otherwise '**Bold:** text' or
    # '-5 dB' would be mistaken for a bullet.
    bullet_re = re.compile(r'^[-*•]\s+(.*)$')
    # Any number of digits. Slicing the first two characters only recognizes
    # two-digit markers and raises IndexError on a bare two-digit line.
    numbered_re = re.compile(r'^\d+[.)]\s+(.*)$')
    rule_re = re.compile(r'^(?:-{3,}|\*{3,}|_{3,})$')

    for line in markdown_text.split('\n'):
        l = line.strip()
        if not l or rule_re.match(l):
            continue

        heading = heading_re.match(l)
        if heading:
            level = max(1, len(heading.group(1)) - 1)
            doc.add_heading(heading.group(2).strip(), level=level)
            continue

        if l.startswith("**") and l.endswith("**") and l.count("**") == 2:
            title = l.replace("*", "").strip()
            if title:
                doc.add_heading(title, level=2)
                continue

        bullet = bullet_re.match(l)
        if bullet:
            _add_runs_with_bold(doc.add_paragraph(style='List Bullet'), bullet.group(1).strip())
            continue

        numbered = numbered_re.match(l)
        if numbered:
            # The 'List Number' style supplies its own number, so the literal
            # marker is dropped to avoid "1. 1. text".
            # NOTE(review): python-docx does not restart numbering between
            # lists — confirm the resulting numbering is acceptable.
            _add_runs_with_bold(doc.add_paragraph(style='List Number'), numbered.group(1).strip())
            continue

        if "**" in l:
            _add_runs_with_bold(doc.add_paragraph(), l)
        else:
            # 'Body Text' is the style NAME; 'BodyText' is only its id, and
            # lookup by id is deprecated in python-docx.
            doc.add_paragraph(l, style='Body Text')

# FINAL QA REPORT AGENT (DOCX)
def final_test_plan_docx(section_rule_reports, pairwise_test_plans, output_docx_path):
    """
    Build the final DOCX: title page, table of contents, then every section
    report followed by the pairwise plan stored under the same key (if any).

    Args:
        section_rule_reports: Mapping of section key -> markdown rule report.
        pairwise_test_plans: Mapping of section key -> markdown pairwise plan.
        output_docx_path: Destination path of the saved document.
    """
    def first_h2_title(markdown_content):
        # Title is the text of the first line that starts with "## ", if any.
        heading_lines = (
            candidate for candidate in markdown_content.split('\n')
            if candidate.startswith("## ")
        )
        first_heading = next(heading_lines, None)
        if first_heading is None:
            return None
        return first_heading.replace("## ", "").strip()

    # Flatten both mappings into the exact order used for the TOC and the body.
    ordered_reports = []
    for section_key, report in section_rule_reports.items():
        ordered_reports.append(report)
        if section_key in pairwise_test_plans:
            ordered_reports.append(pairwise_test_plans[section_key])

    # Reports without a usable "## " title are simply left out of the TOC.
    toc_titles = [title for title in map(first_h2_title, ordered_reports) if title]

    doc = Document()
    doc.add_heading('MIL-STD-188 Automated Compliance Test Plan', 0)
    doc.add_page_break()

    doc.add_heading('Table of Contents', level=1)
    position = 0
    for title in toc_titles:
        position += 1
        doc.add_paragraph(f"{position}. {title}", style='List Number')
    doc.add_page_break()

    for report in ordered_reports:
        markdown_to_docx(report, doc)

    doc.save(output_docx_path)
    print(f"Test plan saved to {output_docx_path}")

# ========== MAIN PIPELINE ==========
if __name__ == "__main__":
    pdf_path = "/GD/My Drive/JITC/military_standards_pdf/MIL-STD-188-203-1A_1995.pdf"
    output_docx_path = "/GD/My Drive/JITC/military_standards_pdf/MIL-STD-188-Auto-TestPlan.docx"

    # 1. Extract sections (skip first 11 pages, 10 per section, ordered)
    sections = extract_sections_from_pdf(pdf_path, start_page=12, pages_per_section=10)
    section_names = list(sections.keys())  # Document order; used below to undo completion order

    # 2. PARALLEL: Extract detailed rules per section (with content-based titles)
    rules_by_section = {}
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [
            executor.submit(extract_rules_for_section, section_name, section_text)
            for section_name, section_text in sections.items()
        ]
        for future in as_completed(futures):
            section_name, rules = future.result()
            rules_by_section[section_name] = rules
            print(f"\n--- {section_name} ---\n{rules}\n{'='*60}")

    # as_completed yields in finishing order, so re-key the results in document
    # order; otherwise "consecutive" pairs and the final report are shuffled.
    section_rule_results = {
        name: rules_by_section[name]
        for name in section_names
        if name in rules_by_section
    }

    # 3. PARALLEL: For each section, create a pairwise test plan with the next section (not cross-pairs)
    pairwise_test_plans = {}
    section_keys = list(section_rule_results)
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            # The first section's key doubles as the key of the pair.
            executor.submit(
                synthesize_pairwise_test_plan,
                s1,
                section_rule_results[s1],
                section_rule_results[s2],
            )
            for s1, s2 in zip(section_keys, section_keys[1:])
        ]
        for future in as_completed(futures):
            pair_name, combined_plan = future.result()
            pairwise_test_plans[pair_name] = combined_plan
            print(f"\n--- Pair: {pair_name} ---\n{combined_plan}\n{'='*60}")

    # 4. Final review, formatting, and DOCX creation (with all reports serialized)
    final_test_plan_docx(section_rule_results, pairwise_test_plans, output_docx_path)


  llm = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
  result = llm([HumanMessage(content=prompt)])



--- Section 1: Pages 12-21 ---
## General Compliance and Interoperability Requirements for TADIL A Systems

**Dependencies:**
- Verify the availability and correct editions of all referenced documents, including MIL-STD-188 series standards, FED-STD-1037, relevant MIL-HDBKs, STANAG 5511, NACSIM/NACSEM documents, and ITU Radio Regulations.
- Ensure access to TADIL A message format definitions in JCS-PUB-10 (though not part of this standard), as correct message formatting is essential for interoperability.
- Confirm applicability and compliance with MIL-STD-188-100 when TADIL A is used over common long-haul and tactical circuits.
- Ensure current Department of Defense procurement regulations and DODD 4630.5 are available and understood for determining necessary compliance actions.

**Conflicts:**
- In the event of a conflict between this standard and any referenced document, the text of this standard takes precedence (Test: Compare and resolve discrepancies; document and justify deviati

  return self._get_style_id_from_style(self[style_name], style_type)


Test plan saved to /GD/My Drive/JITC/military_standards_pdf/MIL-STD-188-Auto-TestPlan.docx


# Section-wise Test Plan

In [None]:
import os
import re
from PyPDF2 import PdfReader
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from google.colab import userdata, files
from concurrent.futures import ThreadPoolExecutor, as_completed
from docx import Document
import chromadb
from sentence_transformers import SentenceTransformer
import openai

# Read the OpenAI API key from the Colab secret store (google.colab.userdata).
openai.api_key = userdata.get('OpenaAIAPI')

# ========== CONFIGURATION ==========
# Chat models that each extract rules from the selected section (one actor per entry).
llm_model_variants = [
    "gpt-4.1",        # Main actor
    "gpt-4o",         # Second actor
    "gpt-4-turbo",    # Third actor
]
# Model names passed to the synthesizing critic and to the final reviewer.
critic_model = "gpt-4.1"
final_critic_model = "gpt-4.1"

# Sentence-embedding model used by search_chromadb_sections to embed the query text.
embedding_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
# Name of the Chroma collection to query, and the on-disk location of the store.
chroma_collection_name = "military_standards"
chroma_client = chromadb.PersistentClient(path="/GD/My Drive/JITC/military_standards")

# ========== DEDUPLICATION HELPERS ==========
def deduplicate_sentences_in_markdown(text):
    """
    Remove repeated sentences inside each markdown block while keeping the
    line structure (headings, bullets, numbered items) intact.

    A block is the run of lines between two boundaries, where a boundary is a
    heading line ("## ..." or a whole-line "**...**") or a blank line. Within
    a block, a sentence is dropped when the same sentence (compared
    case-insensitively with whitespace collapsed) already appeared earlier in
    that block. A line whose sentences were all dropped is removed entirely.

    Lines are processed one by one and a leading list marker ("- ", "* ",
    "• ", "1. ", "2) ") is set aside before sentence splitting, so markers are
    never treated as sentences and items are never merged or split apart.

    Args:
        text: Markdown source as one string.

    Returns:
        The de-duplicated markdown, lines joined with newlines.
    """
    def is_section_boundary(line):
        return line.startswith("## ") or (line.startswith("**") and line.endswith("**"))

    # Optional list marker (group 1) followed by the rest of the line (group 2).
    list_marker = re.compile(r'^(\s*(?:[-*•]|\d+[.)])\s+)?(.*)$')
    sentence_break = re.compile(r'(?<=[.!?]) +')

    output = []
    seen_in_block = set()
    for line in text.split('\n'):
        if is_section_boundary(line) or not line.strip():
            # Boundaries are copied through and restart the dedup scope.
            seen_in_block = set()
            output.append(line)
            continue

        marker, body = list_marker.match(line).groups()
        kept = []
        for sentence in sentence_break.split(body):
            sent = sentence.strip()
            norm = re.sub(r'\s+', ' ', sent.lower())
            if not sent or norm in seen_in_block:
                continue
            seen_in_block.add(norm)
            kept.append(sent)
        if kept:
            output.append((marker or '') + ' '.join(kept))
    return '\n'.join(output)

def final_global_deduplicate(text):
    """
    Drop every line or sentence that already occurred anywhere earlier in
    `text`, comparing case-insensitively with whitespace collapsed.

    Lines longer than 120 characters are compared sentence by sentence;
    shorter lines are compared as a whole. Blank lines are kept, and a line
    whose content was entirely seen before is removed.
    """
    already_emitted = set()
    kept_lines = []
    for raw_line in text.split('\n'):
        # Only long lines are broken into sentences for finer comparison.
        if len(raw_line) > 120:
            fragments = re.split(r'(?<=[.!?]) +', raw_line)
        else:
            fragments = [raw_line]

        survivors = []
        for fragment in fragments:
            key = re.sub(r'\s+', ' ', fragment.strip().lower())
            if not key:
                # Whitespace-only fragments carry no content; keep them as-is.
                survivors.append(fragment)
            elif key not in already_emitted:
                already_emitted.add(key)
                survivors.append(fragment)

        rebuilt = ' '.join(survivors).strip()
        if rebuilt or not raw_line.strip():
            kept_lines.append(rebuilt)
    return '\n'.join(kept_lines)

# ========== ChromaDB Section Retrieval ==========
def search_chromadb_sections(chroma_client, chroma_collection_name, query_text, top_k=3):
    """
    Embed `query_text` with the module-level `embedding_model` and return the
    `top_k` nearest entries of the named Chroma collection.

    Returns:
        A list of {"document": ..., "metadata": ...} dicts, in the order the
        collection query returned them.
    """
    target_collection = chroma_client.get_collection(chroma_collection_name)
    query_vector = embedding_model.encode([query_text])[0].tolist()
    response = target_collection.query(
        query_embeddings=[query_vector],
        n_results=top_k,
        include=['documents', 'metadatas'],
    )
    # A single query was sent, so only the first result row is relevant.
    return [
        {"document": document, "metadata": metadata}
        for document, metadata in zip(response['documents'][0], response['metadatas'][0])
    ]

def select_section_semantically(chroma_client, chroma_collection_name, user_query, top_k=3):
    """
    Show the top semantic matches for `user_query` and let the user pick one.

    Args:
        chroma_client: Chroma client forwarded to search_chromadb_sections.
        chroma_collection_name: Name of the collection to search.
        user_query: Free-text description of the wanted section.
        top_k: Number of candidates to list.

    Returns:
        (document, metadata) of the chosen hit.

    Raises:
        LookupError: If the search returned no candidates; without this guard
            the prompt below could never be answered validly.
    """
    results = search_chromadb_sections(chroma_client, chroma_collection_name, user_query, top_k)
    if not results:
        raise LookupError(
            f"No sections matched the query {user_query!r} in collection {chroma_collection_name!r}."
        )

    print("\nTop semantic matches for your query:")
    for idx, hit in enumerate(results):
        meta = hit['metadata']
        print(f"{idx+1}. Document: {meta['document_name']}, Page: {meta['page']}")
        print(hit['document'][:400] + '\n---')

    while True:
        # Only a non-numeric answer is retried; other errors from input()
        # (e.g. EOFError) propagate instead of looping forever.
        try:
            chosen_idx = int(input(f"Select the number of the section you want to use (1-{len(results)}): ")) - 1
        except ValueError:
            print("Invalid input. Please enter a valid number.")
            continue
        if 0 <= chosen_idx < len(results):
            chosen = results[chosen_idx]
            return chosen['document'], chosen['metadata']
        print("Invalid input. Please enter a number from the list.")

# ========== ACTOR AGENT ==========
def extract_rules_with_llm(section_name, section_text, llm_model):
    """
    Ask one chat model (an "actor") to extract testable rules from a section.

    Args:
        section_name: Label of the section, included in the prompt.
        section_text: Full text of the section to analyse.
        llm_model: Model name handed to ChatOpenAI.

    Returns:
        (llm_model, markdown_text) so that parallel callers can tell which
        model produced which output.
    """
    llm = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
    prompt = (
        "You are a MIL-STD-188 compliance and test planning expert.\n"
        "Analyze the following section of a military standard and extract EVERY possible testable rule, specification, constraint, or requirement. "
        "Rules MUST be extremely detailed, explicit, and step-by-step, and should include measurable criteria, acceptable ranges, and referenced figures or tables if mentioned. "
        "For ambiguous or implicit requirements, describe a specific test strategy.\n"
        "Generate a short, content-based TITLE for this section (do not use page numbers).\n"
        "Organize your output as follows, using markdown headings and bolded text.\n"
        "**ABSOLUTELY DO NOT REPEAT, DUPLICATE, OR PARAPHRASE THE SAME RULE OR LINE. Each requirement, dependency, and test step must appear ONCE ONLY.**\n"
        "## [Section Title]\n"
        "**Dependencies:**\n- List detailed dependencies as explicit tests, if any.\n\n"
        "**Conflicts:**\n- List detected or possible conflicts and provide recommendations or mitigation steps.\n\n"
        "**Test Rules:**\n1. (Very detailed, step-by-step numbered test rules)\n"
        f"\nSection Name: {section_name}\n\nSection Text:\n{section_text}\n\n"
        "---\n"
        "If you find truly nothing testable, reply: 'No testable rules in this section.'"
    )
    # invoke() replaces calling the model object directly, which is deprecated.
    result = llm.invoke([HumanMessage(content=prompt)])
    return llm_model, result.content

# ========== CRITIC AGENT ==========
def critic_review_rules(section_name, section_text, actor_outputs, critic_model, feedback_from_final_critic=None):
    """
    Ask the critic model to merge the actors' rule sets into one test plan.

    Args:
        section_name: Label of the section, included in the prompt.
        section_text: Full text of the section under review.
        actor_outputs: Mapping of model name -> that model's extracted rules.
        critic_model: Model name handed to ChatOpenAI.
        feedback_from_final_critic: Optional review text from a previous
            round; when given it is appended as revision instructions.

    Returns:
        The synthesized test plan as markdown text.
    """
    llm = ChatOpenAI(model_name=critic_model, openai_api_key=openai.api_key)
    actor_sections = "\n\n---\n".join(
        f"Model {model_name}:\n{output}" for model_name, output in actor_outputs.items()
    )
    prompt = (
        "You are a senior MIL-STD-188 test planning reviewer (Critic AI).\n"
        "Given the following section and rules extracted by several different LLMs, do the following:\n"
        "1. Carefully review and compare the provided rule sets.\n"
        "2. Synthesize a SINGLE, detailed and explicit set of testable rules.\n"
        "3. Eliminate redundancies, correct errors, and ensure all requirements are present.\n"
        "4. Ensure the final test plan is step-by-step, detailed, and well organized.\n"
        "**NEVER simply combine all lines verbatim—synthesize, deduplicate, and streamline the content into a concise, non-repetitive format. If a rule, step, or line has the same or similar meaning as another, KEEP ONLY ONE.**\n"
        "Present your result in markdown format with these headings: '## [Section Title]', '**Dependencies:**', '**Conflicts:**', '**Test Rules:**'\n"
        f"\nSection Name: {section_name}\n\nSection Text:\n{section_text}\n"
        "\n---\n"
        "LLM Outputs:\n"
    ) + actor_sections
    if feedback_from_final_critic:
        prompt += (
            "\n\nThe final reviewer found issues with your previous plan. Please revise your plan to address these specific comments:\n"
            f"{feedback_from_final_critic}\n"
        )
    # invoke() replaces calling the model object directly, which is deprecated.
    result = llm.invoke([HumanMessage(content=prompt)])
    return result.content

# ========== FINAL CRITIC AGENT ==========
def final_critic_check(section_name, section_text, final_test_plan, user_query, critic_model):
    """
    Have the final critic review the synthesized plan and extract its verdict.

    Args:
        section_name: Label of the section, included in the prompt.
        section_text: Full text of the section the plan was derived from.
        final_test_plan: Synthesized markdown test plan to review.
        user_query: The user's original request, included in the prompt.
        critic_model: Model name handed to ChatOpenAI.

    Returns:
        (status, review_text) where status is "PASS" or "FAIL". The verdict is
        the first whole-word PASS/FAIL on the first non-empty line (any case);
        if that line has none, the first upper-case PASS/FAIL anywhere in the
        review; if there is none at all, "FAIL".
    """
    llm = ChatOpenAI(model_name=critic_model, openai_api_key=openai.api_key)
    prompt = (
        "You are a final Critic AI reviewing a MIL-STD-188 test plan section for completeness and relevance.\n"
        "1. Ensure that the correct section was selected and fully processed (user query below).\n"
        "2. Review the synthesized test plan for completeness, technical accuracy, and structure.\n"
        "3. If there are any omissions, structural issues, or mismatches with the section, list them and recommend corrections.\n"
        "4. Confirm that all rules and steps are logically derived from the section text.\n"
        "Summarize your review at the top as 'Final Critic Review: [Summary]'. "
        "Indicate clearly 'PASS' if everything is correct, or 'FAIL' and the reason(s) if not. "
        "For section errors, specify if the section(s) retrieved are incorrect.\n\n"
        f"User Query: {user_query}\n"
        f"\nSection Name: {section_name}\n\nSection Text:\n{section_text}\n\n"
        f"Synthesized Test Plan:\n{final_test_plan}\n"
    )
    # invoke() replaces calling the model object directly, which is deprecated.
    result = llm.invoke([HumanMessage(content=prompt)]).content

    # Whole-word matching avoids reading "FAIL - could not pass ..." as PASS,
    # and the fallback accepts a verdict placed below the summary line.
    first_line = next((line for line in result.splitlines() if line.strip()), "")
    verdict = re.search(r"\b(PASS|FAIL)\b", first_line.upper())
    if verdict is None:
        verdict = re.search(r"\b(PASS|FAIL)\b", result)
    status = verdict.group(1) if verdict else "FAIL"
    return status, result

# ========== SAFE MARKDOWN TO DOCX ==========
def markdown_to_docx(markdown_text, doc):
    """
    Append a simple rendering of `markdown_text` to `doc`, falling back to
    unstyled paragraphs when the document lacks a requested style.

    Each non-blank line is stripped and classified, in this order:
      * "## ..."                 -> level-1 heading
      * a single "**...**" span  -> level-2 heading
      * "- " / "* " / "• " item  -> 'List Bullet' paragraph (marker removed)
      * "1. " / "12) " item      -> 'List Number' paragraph (text kept verbatim)
      * line containing "**"     -> paragraph with alternating plain/bold runs
      * anything else            -> plain paragraph

    Args:
        markdown_text: Markdown source as one string.
        doc: Object offering python-docx's add_heading / add_paragraph calls.

    Returns:
        None; `doc` is modified in place.
    """
    def safe_add_paragraph(text, style=None):
        # A missing style surfaces as KeyError; retry without the style.
        try:
            return doc.add_paragraph(text, style=style) if style else doc.add_paragraph(text)
        except KeyError:
            return doc.add_paragraph(text)

    def safe_add_heading(text, level):
        try:
            doc.add_heading(text, level=level)
        except KeyError:
            safe_add_paragraph(text)

    def add_text_paragraph(text, style=None):
        # Render **...** spans as bold runs; odd-numbered pieces of the split
        # are the ones that sat between a pair of "**" markers.
        if "**" not in text:
            return safe_add_paragraph(text, style)
        paragraph = safe_add_paragraph("", style)
        for index, part in enumerate(text.split("**")):
            run = paragraph.add_run(part)
            if index % 2:
                run.bold = True
        return paragraph

    for raw_line in markdown_text.split('\n'):
        text = raw_line.strip()
        if not text:
            continue

        # Length of the leading digit run, used to recognise "N." / "N)" items
        # of any width (the old check only matched exactly two digits and
        # indexed past the end of a bare two-digit line).
        digit_count = len(text) - len(text.lstrip("0123456789"))

        if text.startswith("## "):
            safe_add_heading(text.replace("##", "").strip(), 1)
        elif (len(text) > 4 and text.startswith("**") and text.endswith("**")
              and "**" not in text[2:-2]):
            # Only a whole-line bold span is a sub-heading; lines with several
            # bold spans are handled as inline bold further down.
            safe_add_heading(text.replace("*", "").strip(), 2)
        elif text[0] in "-*•" and text[1:2].isspace():
            # Whitespace after the marker is required so "**Bold:** text" is
            # not mistaken for a "*" bullet and stripped of its markers.
            add_text_paragraph(text[1:].strip(), 'List Bullet')
        elif (digit_count
              and text[digit_count:digit_count + 1] in ('.', ')')
              and text[digit_count + 1:digit_count + 2].isspace()):
            add_text_paragraph(text, 'List Number')
        else:
            add_text_paragraph(text)

# ========== DOCX GENERATION ==========
def test_plan_docx_from_template(template_docx_path, section_name, section_rule_report, final_critic_report, output_docx_path):
    """
    Append the generated test plan and the final critic review to a copy of
    the template document and save it.

    Args:
        template_docx_path: Path of the DOCX template to open.
        section_name: Label of the section, used in the "Section:" line.
        section_rule_report: Markdown test plan, rendered via markdown_to_docx.
        final_critic_report: Review text, added as one plain paragraph.
        output_docx_path: Destination path of the saved document.
    """
    doc = Document(template_docx_path)
    doc.add_page_break()

    def add_heading_or_plain(text, level):
        # Templates may lack the Title/Heading styles; a missing style is
        # reported as KeyError, in which case the text is added unstyled.
        try:
            doc.add_heading(text, level)
        except KeyError:
            doc.add_paragraph(text)

    add_heading_or_plain('Automated Compliance Test Plan', 0)
    try:
        doc.add_paragraph(f"Section: {section_name}", style='Heading 1')
    except KeyError:
        doc.add_paragraph(f"Section: {section_name}")
    markdown_to_docx(section_rule_report, doc)
    add_heading_or_plain('Final Critic Review', 1)
    doc.add_paragraph(final_critic_report)
    doc.save(output_docx_path)
    print(f"Test plan for {section_name} saved to {output_docx_path}")

# ========== MAIN PIPELINE ==========
if __name__ == "__main__":
    # A DOCX template is mandatory here: the generated plan is appended to it.
    print("Please upload your DOCX template.")
    uploaded = files.upload()
    template_docx_path = None
    for filename in uploaded.keys():
        if filename.lower().endswith('.docx'):
            template_docx_path = filename
            break
    if template_docx_path is None:
        raise ValueError("No DOCX template was uploaded.")

    user_query = input("Describe what section or topic you want to generate a test plan for (semantic query): ")

    # Each pass of this loop handles one user-selected section. It repeats only
    # when the final critic reports that the wrong section was retrieved.
    section_confirmed = False
    while not section_confirmed:
        # 1. Use ChromaDB semantic retrieval for candidate section(s)
        section_text, section_metadata = select_section_semantically(
            chroma_client, chroma_collection_name, user_query, top_k=3
        )
        section_name = f"{section_metadata['document_name']} (Page {section_metadata['page']})"

        print(f"\n=== Processing {section_name} ===\n")

        # 2. Parallel LLM Actors
        actor_outputs = {}
        with ThreadPoolExecutor(max_workers=len(llm_model_variants)) as executor:
            futures = [
                executor.submit(extract_rules_with_llm, section_name, section_text, m)
                for m in llm_model_variants
            ]
            for future in as_completed(futures):
                model_name, actor_output = future.result()
                actor_outputs[model_name] = actor_output

        print("\n--- Actor Outputs ---\n")
        for k, v in actor_outputs.items():
            print(f"\nModel {k} output:\n{v}\n{'='*40}")

        # 3. Critic AI loop (for feedback correction)
        critic_feedback = None
        while True:
            critic_output = critic_review_rules(
                section_name, section_text, actor_outputs, critic_model,
                feedback_from_final_critic=critic_feedback
            )
            # =========== SMART SENTENCE-LEVEL DEDUPLICATION ===========
            critic_output = deduplicate_sentences_in_markdown(critic_output)
            # =========== FINAL GLOBAL DEDUPLICATION ===========
            critic_output = final_global_deduplicate(critic_output)

            # 4. Final Critic AI
            status, final_critic_output = final_critic_check(
                section_name, section_text, critic_output, user_query, final_critic_model
            )
            # Also deduplicate the final critic review (optional)
            final_critic_output = final_global_deduplicate(final_critic_output)

            print(f"\n--- Final Critic Review ---\n{final_critic_output}\n{'='*60}")

            if status == "PASS":
                output_docx_path = f"Auto-TestPlan_{section_name.replace(':', '').replace(' ', '_')}.docx"
                test_plan_docx_from_template(
                    template_docx_path, section_name, critic_output, final_critic_output, output_docx_path
                )
                print(f"\nCompleted: {output_docx_path}")
                section_confirmed = True
                break

            # If it's a section retrieval error, restart ChromaDB semantic selection.
            # Leaving the critic loop with section_confirmed still False sends
            # control back to the selection step above; previously the same
            # section was re-processed instead.
            if any(err in final_critic_output.lower() for err in ["wrong section", "incorrect section", "section retrieval"]):
                print("\n[!] Final Critic flagged incorrect section. Let's try semantic selection again.\n")
                break

            print("\n[!] Final Critic flagged test plan issues. Looping back to Critic AI with feedback.\n")
            critic_feedback = final_critic_output


Please upload your DOCX template.


Saving test_plan_template.docx to test_plan_template.docx
Describe what section or topic you want to generate a test plan for (semantic query): Transmission Frame Structure

Top semantic matches for your query:
1. Document: MIL-STD-188-203-1A_1995.pdf, Page: 27
15MIL-STD-188-203-1A 
 
 
 
 
FIGURE 4.  Transmission frame s tructure .
[Image 1]: 
# Description:
Figure: Frame‐level structure of DNCS (Digital Network Control Station) and picket communications showing all major message types (A–H) and the sequence of constituent fields.
A. Roll Call / DNCS Invocation (RW)—5-frame Preamble, 1-frame Phase Reference, 2-frame Address.
B. Roll Call / Picket Reply 
---
2. Document: MIL-STD-188-203-1A_1995.pdf, Page: 97
85MIL-STD-188-203-1A 
 
APPENDIX  D2
[Image 1]: 
# Description:
Figure 23. Transmit‐Address Timing

From top to bottom, the timing diagram shows:
1. Frame line (logic 0 = low, logic 1 = high)
2. Incoming data line (logic 0 = low, logic 1 = high)
3. Address data line (logic 0 = low,

In [None]:
import os
import re
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from google.colab import userdata, files
from concurrent.futures import ThreadPoolExecutor, as_completed
import chromadb
from sentence_transformers import SentenceTransformer
import openai

# Read the OpenAI API key from the Colab secret store (google.colab.userdata).
openai.api_key = userdata.get('OpenaAIAPI')

# ========== CONFIGURATION ==========
# Chat models that each extract rules from the selected section (one actor per entry).
llm_model_variants = [
    "gpt-4.1",        # Main actor
    "gpt-4o",         # Second actor
    "gpt-4-turbo",    # Third actor
]
# Model names intended for the critic and final-critic agents defined below.
critic_model = "gpt-4.1"
final_critic_model = "gpt-4.1"

# Sentence-embedding model used by search_chromadb_sections to embed the query text.
embedding_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
# Name of the Chroma collection to query, and the on-disk location of the store.
chroma_collection_name = "military_standards"
chroma_client = chromadb.PersistentClient(path="/GD/My Drive/JITC/military_standards")

# ========== DEDUPLICATION HELPERS ==========
def deduplicate_sentences_in_markdown(text):
    """
    Remove repeated sentences inside each markdown block while keeping the
    line structure (headings, bullets, numbered items) intact.

    A block is the run of lines between two boundaries, where a boundary is a
    heading line ("## ..." or a whole-line "**...**") or a blank line. Within
    a block, a sentence is dropped when the same sentence (compared
    case-insensitively with whitespace collapsed) already appeared earlier in
    that block. A line whose sentences were all dropped is removed entirely.

    Lines are processed one by one and a leading list marker ("- ", "* ",
    "• ", "1. ", "2) ") is set aside before sentence splitting, so markers are
    never treated as sentences and items are never merged or split apart.

    Args:
        text: Markdown source as one string.

    Returns:
        The de-duplicated markdown, lines joined with newlines.
    """
    def is_section_boundary(line):
        return line.startswith("## ") or (line.startswith("**") and line.endswith("**"))

    # Optional list marker (group 1) followed by the rest of the line (group 2).
    list_marker = re.compile(r'^(\s*(?:[-*•]|\d+[.)])\s+)?(.*)$')
    sentence_break = re.compile(r'(?<=[.!?]) +')

    output = []
    seen_in_block = set()
    for line in text.split('\n'):
        if is_section_boundary(line) or not line.strip():
            # Boundaries are copied through and restart the dedup scope.
            seen_in_block = set()
            output.append(line)
            continue

        marker, body = list_marker.match(line).groups()
        kept = []
        for sentence in sentence_break.split(body):
            sent = sentence.strip()
            norm = re.sub(r'\s+', ' ', sent.lower())
            if not sent or norm in seen_in_block:
                continue
            seen_in_block.add(norm)
            kept.append(sent)
        if kept:
            output.append((marker or '') + ' '.join(kept))
    return '\n'.join(output)

def final_global_deduplicate(text):
    """
    Drop every line or sentence that already occurred anywhere earlier in
    `text`, comparing case-insensitively with whitespace collapsed.

    Lines longer than 120 characters are compared sentence by sentence;
    shorter lines are compared as a whole. Blank lines are kept, and a line
    whose content was entirely seen before is removed.
    """
    already_emitted = set()
    kept_lines = []
    for raw_line in text.split('\n'):
        # Only long lines are broken into sentences for finer comparison.
        if len(raw_line) > 120:
            fragments = re.split(r'(?<=[.!?]) +', raw_line)
        else:
            fragments = [raw_line]

        survivors = []
        for fragment in fragments:
            key = re.sub(r'\s+', ' ', fragment.strip().lower())
            if not key:
                # Whitespace-only fragments carry no content; keep them as-is.
                survivors.append(fragment)
            elif key not in already_emitted:
                already_emitted.add(key)
                survivors.append(fragment)

        rebuilt = ' '.join(survivors).strip()
        if rebuilt or not raw_line.strip():
            kept_lines.append(rebuilt)
    return '\n'.join(kept_lines)

# ========== ChromaDB Section Retrieval ==========
def search_chromadb_sections(chroma_client, chroma_collection_name, query_text, top_k=3):
    """
    Embed `query_text` with the module-level `embedding_model` and return the
    `top_k` nearest entries of the named Chroma collection.

    Returns:
        A list of {"document": ..., "metadata": ...} dicts, in the order the
        collection query returned them.
    """
    target_collection = chroma_client.get_collection(chroma_collection_name)
    query_vector = embedding_model.encode([query_text])[0].tolist()
    response = target_collection.query(
        query_embeddings=[query_vector],
        n_results=top_k,
        include=['documents', 'metadatas'],
    )
    # A single query was sent, so only the first result row is relevant.
    return [
        {"document": document, "metadata": metadata}
        for document, metadata in zip(response['documents'][0], response['metadatas'][0])
    ]

def select_section_semantically(chroma_client, chroma_collection_name, user_query, top_k=3):
    """
    Show the top semantic matches for `user_query` and let the user pick one.

    Args:
        chroma_client: Chroma client forwarded to search_chromadb_sections.
        chroma_collection_name: Name of the collection to search.
        user_query: Free-text description of the wanted section.
        top_k: Number of candidates to list.

    Returns:
        (document, metadata) of the chosen hit.

    Raises:
        LookupError: If the search returned no candidates; without this guard
            the prompt below could never be answered validly.
    """
    results = search_chromadb_sections(chroma_client, chroma_collection_name, user_query, top_k)
    if not results:
        raise LookupError(
            f"No sections matched the query {user_query!r} in collection {chroma_collection_name!r}."
        )

    print("\nTop semantic matches for your query:")
    for idx, hit in enumerate(results):
        meta = hit['metadata']
        print(f"{idx+1}. Document: {meta['document_name']}, Page: {meta['page']}")
        print(hit['document'][:400] + '\n---')

    while True:
        # Only a non-numeric answer is retried; other errors from input()
        # (e.g. EOFError) propagate instead of looping forever.
        try:
            chosen_idx = int(input(f"Select the number of the section you want to use (1-{len(results)}): ")) - 1
        except ValueError:
            print("Invalid input. Please enter a valid number.")
            continue
        if 0 <= chosen_idx < len(results):
            chosen = results[chosen_idx]
            return chosen['document'], chosen['metadata']
        print("Invalid input. Please enter a number from the list.")

# ========== ACTOR AGENT ==========
def extract_rules_with_llm(section_name, section_text, llm_model):
    """
    Ask one chat model (an "actor") to extract testable rules from a section.

    Args:
        section_name: Label of the section, included in the prompt.
        section_text: Full text of the section to analyse.
        llm_model: Model name handed to ChatOpenAI.

    Returns:
        (llm_model, markdown_text) so that parallel callers can tell which
        model produced which output.
    """
    llm = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
    prompt = (
        "You are a MIL-STD-188 compliance and test planning expert.\n"
        "Analyze the following section of a military standard and extract EVERY possible testable rule, specification, constraint, or requirement. "
        "Rules MUST be extremely detailed, explicit, and step-by-step, and should include measurable criteria, acceptable ranges, and referenced figures or tables if mentioned. "
        "For ambiguous or implicit requirements, describe a specific test strategy.\n"
        "Generate a short, content-based TITLE for this section (do not use page numbers).\n"
        "Organize your output as follows, using markdown headings and bolded text.\n"
        "**ABSOLUTELY DO NOT REPEAT, DUPLICATE, OR PARAPHRASE THE SAME RULE OR LINE. Each requirement, dependency, and test step must appear ONCE ONLY.**\n"
        "## [Section Title]\n"
        "**Dependencies:**\n- List detailed dependencies as explicit tests, if any.\n\n"
        "**Conflicts:**\n- List detected or possible conflicts and provide recommendations or mitigation steps.\n\n"
        "**Test Rules:**\n1. (Very detailed, step-by-step numbered test rules)\n"
        f"\nSection Name: {section_name}\n\nSection Text:\n{section_text}\n\n"
        "---\n"
        "If you find truly nothing testable, reply: 'No testable rules in this section.'"
    )
    # invoke() replaces calling the model object directly, which is deprecated.
    result = llm.invoke([HumanMessage(content=prompt)])
    return llm_model, result.content

# ========== CRITIC AGENT ==========
def critic_review_rules(section_name, section_text, actor_outputs, critic_model, feedback_from_final_critic=None):
    """
    Ask the critic model to merge the actors' rule sets into one test plan.

    Args:
        section_name: Label of the section, included in the prompt.
        section_text: Full text of the section under review.
        actor_outputs: Mapping of model name -> that model's extracted rules.
        critic_model: Model name handed to ChatOpenAI.
        feedback_from_final_critic: Optional review text from a previous
            round; when given it is appended as revision instructions.

    Returns:
        The synthesized test plan as markdown text.
    """
    llm = ChatOpenAI(model_name=critic_model, openai_api_key=openai.api_key)
    actor_sections = "\n\n---\n".join(
        f"Model {model_name}:\n{output}" for model_name, output in actor_outputs.items()
    )
    prompt = (
        "You are a senior MIL-STD-188 test planning reviewer (Critic AI).\n"
        "Given the following section and rules extracted by several different LLMs, do the following:\n"
        "1. Carefully review and compare the provided rule sets.\n"
        "2. Synthesize a SINGLE, detailed and explicit set of testable rules.\n"
        "3. Eliminate redundancies, correct errors, and ensure all requirements are present.\n"
        "4. Ensure the final test plan is step-by-step, detailed, and well organized.\n"
        "**NEVER simply combine all lines verbatim—synthesize, deduplicate, and streamline the content into a concise, non-repetitive format. If a rule, step, or line has the same or similar meaning as another, KEEP ONLY ONE.**\n"
        "Present your result in markdown format with these headings: '## [Section Title]', '**Dependencies:**', '**Conflicts:**', '**Test Rules:**'\n"
        f"\nSection Name: {section_name}\n\nSection Text:\n{section_text}\n"
        "\n---\n"
        "LLM Outputs:\n"
    ) + actor_sections
    if feedback_from_final_critic:
        prompt += (
            "\n\nThe final reviewer found issues with your previous plan. Please revise your plan to address these specific comments:\n"
            f"{feedback_from_final_critic}\n"
        )
    # invoke() replaces calling the model object directly, which is deprecated.
    result = llm.invoke([HumanMessage(content=prompt)])
    return result.content

# ========== FINAL CRITIC AGENT ==========
def final_critic_check(section_name, section_text, final_test_plan, user_query, critic_model):
    """
    Have the final critic review the synthesized plan and extract its verdict.

    Args:
        section_name: Label of the section, included in the prompt.
        section_text: Full text of the section the plan was derived from.
        final_test_plan: Synthesized markdown test plan to review.
        user_query: The user's original request, included in the prompt.
        critic_model: Model name handed to ChatOpenAI.

    Returns:
        (status, review_text) where status is "PASS" or "FAIL". The verdict is
        the first whole-word PASS/FAIL on the first non-empty line (any case);
        if that line has none, the first upper-case PASS/FAIL anywhere in the
        review; if there is none at all, "FAIL".
    """
    llm = ChatOpenAI(model_name=critic_model, openai_api_key=openai.api_key)
    prompt = (
        "You are a final Critic AI reviewing a MIL-STD-188 test plan section for completeness and relevance.\n"
        "1. Ensure that the correct section was selected and fully processed (user query below).\n"
        "2. Review the synthesized test plan for completeness, technical accuracy, and structure.\n"
        "3. If there are any omissions, structural issues, or mismatches with the section, list them and recommend corrections.\n"
        "4. Confirm that all rules and steps are logically derived from the section text.\n"
        "Summarize your review at the top as 'Final Critic Review: [Summary]'. "
        "Indicate clearly 'PASS' if everything is correct, or 'FAIL' and the reason(s) if not. "
        "For section errors, specify if the section(s) retrieved are incorrect.\n\n"
        f"User Query: {user_query}\n"
        f"\nSection Name: {section_name}\n\nSection Text:\n{section_text}\n\n"
        f"Synthesized Test Plan:\n{final_test_plan}\n"
    )
    # invoke() replaces calling the model object directly, which is deprecated.
    result = llm.invoke([HumanMessage(content=prompt)]).content

    # Whole-word matching avoids reading "FAIL - could not pass ..." as PASS,
    # and the fallback accepts a verdict placed below the summary line.
    first_line = next((line for line in result.splitlines() if line.strip()), "")
    verdict = re.search(r"\b(PASS|FAIL)\b", first_line.upper())
    if verdict is None:
        verdict = re.search(r"\b(PASS|FAIL)\b", result)
    status = verdict.group(1) if verdict else "FAIL"
    return status, result

# ========== TEXT FILE GENERATION ==========
def write_testplan_txt(output_txt_path, section_name, critic_output, final_critic_output):
    """
    Save one section's test plan and its final critic review as a UTF-8 text file.

    The file holds a fixed title, the section label, the plan body, a separator
    line, and the review, in that order. A confirmation line is printed afterwards.
    """
    pieces = [
        "# Automated Compliance Test Plan\n\n",
        f"Section: {section_name}\n\n",
        critic_output,
        "\n\n=== Final Critic Review ===\n",
        final_critic_output,
    ]
    with open(output_txt_path, "w", encoding="utf-8") as out_file:
        for piece in pieces:
            out_file.write(piece)
    print(f"Test plan for {section_name} saved to {output_txt_path}")

# ========== MAIN PIPELINE ==========
if __name__ == "__main__":
    # Interactive driver: select a section semantically, fan out to several actor
    # models, let the critic synthesize a plan, and loop until the final critic
    # returns PASS (writing a .txt) or flags the section itself as wrong.
    print("Please upload your DOCX template (for context, but output will be .txt).")
    uploaded = files.upload()
    template_docx_path = None
    for filename in uploaded.keys():
        if filename.lower().endswith('.docx'):
            template_docx_path = filename
            break
    if template_docx_path is None:
        print("Warning: No DOCX template was uploaded. Proceeding without docx template context.")

    user_query = input("Describe what section or topic you want to generate a test plan for (semantic query): ")

    section_confirmed = False
    while not section_confirmed:
        # 1. Use ChromaDB semantic retrieval for candidate section(s)
        section_text, section_metadata = select_section_semantically(
            chroma_client, chroma_collection_name, user_query, top_k=3
        )
        section_name = f"{section_metadata['document_name']} (Page {section_metadata['page']})"

        process_completed = False
        while not process_completed:
            print(f"\n=== Processing {section_name} ===\n")

            # 2. Parallel LLM Actors
            actor_outputs = {}
            with ThreadPoolExecutor(max_workers=len(llm_model_variants)) as executor:
                futures = [
                    executor.submit(extract_rules_with_llm, section_name, section_text, m)
                    for m in llm_model_variants
                ]
                for future in as_completed(futures):
                    model_name, actor_output = future.result()
                    actor_outputs[model_name] = actor_output

            print("\n--- Actor Outputs ---\n")
            for k, v in actor_outputs.items():
                print(f"\nModel {k} output:\n{v}\n{'='*40}")

            # 3. Critic AI loop (for feedback correction)
            critic_feedback = None
            critic_done = False
            while not critic_done:
                critic_output = critic_review_rules(
                    section_name, section_text, actor_outputs, critic_model,
                    feedback_from_final_critic=critic_feedback
                )
                # =========== SMART SENTENCE-LEVEL DEDUPLICATION ===========
                critic_output = deduplicate_sentences_in_markdown(critic_output)
                # =========== FINAL GLOBAL DEDUPLICATION ===========
                critic_output = final_global_deduplicate(critic_output)

                # 4. Final Critic AI
                status, final_critic_output = final_critic_check(
                    section_name, section_text, critic_output, user_query, final_critic_model
                )
                # Deduplicate the final critic review as well (optional but recommended)
                final_critic_output = final_global_deduplicate(final_critic_output)

                print(f"\n--- Final Critic Review ---\n{final_critic_output}\n{'='*60}")

                if status == "PASS":
                    output_txt_path = f"Auto-TestPlan_{section_name.replace(':', '').replace(' ', '_')}.txt"
                    write_testplan_txt(
                        output_txt_path, section_name, critic_output, final_critic_output
                    )
                    print(f"\nCompleted: {output_txt_path}")
                    process_completed = True
                    critic_done = True
                    section_confirmed = True
                else:
                    # If it's a section retrieval error, restart ChromaDB semantic selection
                    if any(err in final_critic_output.lower() for err in ["wrong section", "incorrect section", "section retrieval"]):
                        print("\n[!] Final Critic flagged incorrect section. Let's try semantic selection again.\n")
                        section_confirmed = False
                        # Leave the per-section processing loop as well; `break` alone
                        # only exits the critic loop, which would re-run the actors on
                        # the same (wrong) section forever instead of re-selecting.
                        process_completed = True
                        break
                    else:
                        print("\n[!] Final Critic flagged test plan issues. Looping back to Critic AI with feedback.\n")
                        critic_feedback = final_critic_output


Please upload your DOCX template (for context, but output will be .txt).


Saving test_plan_template.docx to test_plan_template (1).docx
Describe what section or topic you want to generate a test plan for (semantic query): Transmission Frame

Top semantic matches for your query:
1. Document: MIL-STD-188-203-1A_1995.pdf, Page: 27
15MIL-STD-188-203-1A 
 
 
 
 
FIGURE 4.  Transmission frame structure.
[Image 1]: 
# Description:
Figure: Frame‐level structure of DNCS (Digital Network Control Station) and picket communications showing all major message types (A–H) and the sequence of constituent fields.
A. Roll Call / DNCS Invocation (RW)—5-frame Preamble, 1-frame Phase Reference, 2-frame Address.
B. Roll Call / Picket Reply 
---
2. Document: MIL-STD-188-203-1A_1995.pdf, Page: 97
85MIL-STD-188-203-1A 
 
APPENDIX  D2
[Image 1]: 
# Description:
Figure 23. Transmit‐Address Timing

From top to bottom, the timing diagram shows:
1. Frame line (logic 0 = low, logic 1 = high)
2. Incoming data line (logic 0 = low, logic 1 = high)
3. Address data line (logic 0 = low, logic

  llm = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
  result = llm([HumanMessage(content=prompt)])



--- Actor Outputs ---


Model gpt-4o output:
## Transmission Frame Structure

**Dependencies:**
- Transmission frame tests require understanding of DNCS (Digital Network Control Station) and picket communication protocols.
- Variable-length Data fields necessitate dynamic test cases that can handle different data sizes.
- Manual initiation and reset actions require human intervention or automation tools set up to simulate manual operations.

**Conflicts:**
- Potential conflict between manual initiation and automation testing: Ensure that automation tools can simulate manual initiation accurately.

**Test Rules:**

1. **Roll Call/DNCS Invocation (RW) Test:**
   - Verify the presence of a 5-frame Preamble at the start of the message.
   - Confirm a 1-frame Phase Reference immediately follows the Preamble.
   - Check for a 2-frame Address field succeeding the Phase Reference.

2. **Roll Call/Picket Reply Message (PRM) Test:**
   - Validate the 5-frame Preamble is correctly formatted at t

# Pypandoc + Test Card + Vector DB

In [None]:
import os
import re
import json
import tempfile
import subprocess
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

# ---------- PDF & I/O ----------
from PyPDF2 import PdfReader
import pypandoc

# ---------- LLM (OpenAI via LangChain for your rules/plan/test-card) ----------
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

# ---------- Colab secret helper ----------
try:
    from google.colab import userdata  # type: ignore
except Exception:  # pragma: no cover
    userdata = None

# ---------- Vector DB + embeddings ----------
import chromadb
from sentence_transformers import SentenceTransformer

# ---------- Images & PDF visuals (PyMuPDF) ----------
import numpy as np
from PIL import Image

try:
    import fitz  # PyMuPDF
except Exception:
    fitz = None

# Optional OCR
try:
    import pytesseract  # type: ignore
except Exception:
    pytesseract = None

try:
    from pdf2image import convert_from_path  # type: ignore
except Exception:
    convert_from_path = None

# ---------- MarkItDown for image description (OpenAI multimodal) ----------
from openai import OpenAI
from markitdown import MarkItDown


# ============================ CONFIG ============================

# OpenAI key (colab or env)
import openai

# Prefer the Colab secret; fall back to the OPENAI_API_KEY environment variable when
# not running in Colab, or when the secret is missing / access to it is not granted
# (userdata.get raises in those cases).
_colab_key = ""
if userdata is not None:
    try:
        _colab_key = userdata.get('OpenaAIAPI') or ""
    except Exception as exc:
        print(f"[config] Colab secret 'OpenaAIAPI' unavailable ({exc}); falling back to OPENAI_API_KEY.")
openai.api_key = _colab_key or os.environ.get("OPENAI_API_KEY", "")
# Mirror the key into the environment; OpenAI() is constructed without an explicit key below.
os.environ["OPENAI_API_KEY"] = openai.api_key

# Model for rules / pairwise / test-cards
llm_model = "gpt-4.1"

# PDF & output paths (from your original code)
pdf_path = "/GD/My Drive/JITC/military_standards_pdf/MIL-STD-188-203-1A_1995.pdf"
output_docx_path = "/GD/My Drive/JITC/military_standards_pdf/MIL-STD-188-Auto-TestPlan.docx"
output_markdown_path = output_docx_path.replace(".docx", ".md")  # save master .md too

# Vector DB locations derived from the same folder as the PDF
folder_path = os.path.dirname(pdf_path)
chroma_path = os.path.join(folder_path, "chroma_store")       # persistent chroma
collection_name = "military_standards"                        # you can rename

# Ingestion knobs
EMBED_MODEL = "multi-qa-mpnet-base-dot-v1"    # SentenceTransformer model name
MARKITDOWN_MODEL = "o4-mini"                  # llm_model given to MarkItDown for image description
DESCRIBE_IMAGES = True                        # run MarkItDown over the saved page images
MAX_IMAGES_PER_PAGE = None                    # cap on described images per page; None = no cap
OCR_ENABLED = False                           # OCR a page only when its extracted text is empty
OCR_DPI = 220                                 # dpi passed to pdf2image for the OCR render
PAGE_RASTER_DPI = 144                         # dpi of the full-page raster saved for non-XObject visuals
PROBE_DPI = 72                                # dpi of the low-res render used by the visual probe
PROBE_NONWHITE_THRESHOLD = 0.01               # non-white pixel fraction at which the probe reports visuals
DETECT_INLINE_BLOCKS = True                   # look for image-type blocks in the page's rawdict
DETECT_DRAWINGS = True                        # look for vector drawings on the page
MIN_DRAWING_AREA = 2000.0                     # minimum drawing rect area for it to count as a visual
RASTERIZE_WHEN_NON_XOBJECT_VISUALS = True     # save a page raster when visuals exist but no XObject images
IMAGE_OUT_DIRNAME = "_extracted_images"       # sub-folder (next to the PDF) for saved images
BATCH_SIZE_EMBED = 32                         # documents per embedding call
BATCH_SIZE_UPSERT = 64                        # documents per Chroma upsert call

# Parallelism for rule extraction / test-cards / pairwise sections
RULES_MAX_WORKERS = 8
PAIRWISE_MAX_WORKERS = 4


# ============================ PANDOC ============================

def ensure_pandoc():
    """Check that pypandoc can locate pandoc; if it cannot, download one via pypandoc."""
    try:
        pypandoc.get_pandoc_path()
        return
    except OSError:
        print("Pandoc not found by pypandoc. Attempting to download via pypandoc...")
    pypandoc.download_pandoc()


def write_docx_with_pandoc(markdown_text: str, output_docx_path: str, reference_docx: Optional[str] = None, also_save_md: Optional[str] = None):
    """
    Convert Markdown text to a DOCX file with pandoc (via pypandoc).

    Several input-format strings are tried in order; the first that converts wins.

    Args:
        markdown_text: The Markdown document to convert.
        output_docx_path: Destination path of the DOCX file.
        reference_docx: Optional DOCX passed to pandoc as --reference-doc.
        also_save_md: Optional path where the raw Markdown is saved as well.

    Raises:
        RuntimeError: The error from the last attempted format if every attempt fails.
    """
    ensure_pandoc()

    if also_save_md:
        md_dir = os.path.dirname(also_save_md)
        if md_dir:
            # A bare filename has no directory part, and os.makedirs("") raises.
            os.makedirs(md_dir, exist_ok=True)
        with open(also_save_md, "w", encoding="utf-8") as f:
            f.write(markdown_text)

    # Helpful for debugging your environment:
    try:
        ver = subprocess.check_output(["pandoc", "-v"]).decode(errors="ignore").splitlines()[0]
        print(f"[pandoc] {ver}")
    except Exception:
        # Best effort only: pandoc may not be on PATH even when pypandoc can find it.
        pass

    with tempfile.NamedTemporaryFile(suffix=".md", delete=False, mode="w", encoding="utf-8") as tmp:
        tmp.write(markdown_text)
        tmp_md_path = tmp.name

    extra_args = ["--toc", "--toc-depth=3", "--number-sections"]
    if reference_docx:
        extra_args += ["--reference-doc", reference_docx]

    candidate_from_formats = [
        "gfm+pipe_tables+autolink_bare_uris",
        "markdown+pipe_tables+autolink_bare_uris",
        "markdown+fenced_code_blocks+autolink_bare_uris+pipe_tables",
    ]

    last_err = None
    try:
        for frm in candidate_from_formats:
            try:
                pypandoc.convert_file(
                    tmp_md_path,
                    to="docx",
                    outputfile=output_docx_path,
                    extra_args=extra_args,
                    format=frm
                )
                print(f"Test plan saved to {output_docx_path} (from={frm})")
                return
            except RuntimeError as e:
                last_err = e
                print(f"[pandoc retry] failed with from={frm} -> {e}")
    finally:
        # The temp file was created with delete=False, so remove it ourselves
        # whether the conversion succeeded or not.
        try:
            os.remove(tmp_md_path)
        except OSError:
            pass

    raise last_err


# ============================ INGEST (PDF → pages with text+image desc → Chroma) ============================

@dataclass
class PageRecord:
    """Extraction result for one PDF page: its text, saved image files, and their descriptions."""
    # Page number; the ingest code assigns it by enumerating pages starting at 1.
    page: int
    # Text extracted for the page (an empty string when nothing was extracted).
    text: str
    # Paths of the image files saved for this page.
    images: List[str]
    # Description strings for the page's images; created empty and filled by _describe_images_for_pages.
    image_descriptions: List[str]


def _rect_area(rect: "fitz.Rect") -> float:
    """Return the area of *rect*, using its own get_area() when that works.

    If get_area() is missing or raises, the area is computed from the corner
    coordinates and clamped so that it is never negative.
    """
    try:
        area = rect.get_area()
    except Exception:
        width = rect.x1 - rect.x0
        height = rect.y1 - rect.y0
        area = max(0.0, width * height)
    return area


def _get_text_rects(page: "fitz.Page") -> List["fitz.Rect"]:
    """
    Collect the bounding rectangles of the page's text blocks.

    Best effort: any error while reading blocks is swallowed, and the rectangles
    gathered so far (possibly none) are returned.
    """
    rects: List["fitz.Rect"] = []
    try:
        for b in page.get_text("blocks") or []:
            # PyMuPDF block tuples have 7 items:
            # (x0, y0, x1, y1, text, block_no, block_type); block_type 0 = text.
            # Requiring 8 items never matched, so no text was ever masked.
            if len(b) >= 7 and b[6] == 0:
                rects.append(fitz.Rect(b[0], b[1], b[2], b[3]))
    except Exception:
        pass
    return rects


def _visual_probe_has_nontext(page: "fitz.Page", text_rects: List["fitz.Rect"], probe_dpi: int, nonwhite_threshold: float) -> bool:
    """
    Render the page at low resolution, blank out the text rectangles, and report
    whether the remaining non-white pixel fraction reaches the threshold.

    Args:
        page: Page to render.
        text_rects: Rectangles (page coordinates) to paint white before measuring.
        probe_dpi: Render resolution; the scale factor is probe_dpi / 72.
        nonwhite_threshold: Minimum fraction of non-white pixels that counts as visual content.

    Returns:
        True when non-text visual content appears to be present, else False.
    """
    scale = max(1e-6, probe_dpi / 72.0)
    pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)  # RGB8
    w, h = pix.width, pix.height
    if w == 0 or h == 0:
        return False
    # np.frombuffer over an immutable bytes buffer yields a read-only array;
    # copy it so the masking assignments below do not raise ValueError.
    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(h, w, pix.n).copy()

    # mask text rects to white
    for r in text_rects:
        x0 = max(0, int(r.x0 * scale)); y0 = max(0, int(r.y0 * scale))
        x1 = min(w, int(r.x1 * scale)); y1 = min(h, int(r.y1 * scale))
        if x1 > x0 and y1 > y0:
            img[y0:y1, x0:x1, :] = 255

    # A pixel is non-white when any channel falls below 250.
    nonwhite = np.any(img < 250, axis=2)
    ratio = nonwhite.sum() / (w * h)
    return ratio >= nonwhite_threshold


def _init_markitdown(model_name: str) -> MarkItDown:
    """Create a MarkItDown converter that uses a default-constructed OpenAI client and *model_name*."""
    return MarkItDown(llm_client=OpenAI(), llm_model=model_name)


def _describe_images_for_pages(pages: List[PageRecord], md: MarkItDown, max_images_per_page: Optional[int]) -> None:
    """
    Fill each page's image_descriptions by running md.convert() on its image files.

    At most max_images_per_page images are described per page (None = all). A
    failed conversion yields an "Image description failed: ..." entry instead of
    raising, so the result list stays aligned with the images processed.
    """
    def _describe(image_path: str) -> str:
        try:
            converted = md.convert(image_path)
            return getattr(converted, "text_content", None) or str(converted)
        except Exception as e:
            return f"Image description failed: {e}"

    for record in pages:
        selected = list(record.images)
        if max_images_per_page is not None:
            selected = selected[:max_images_per_page]
        record.image_descriptions = [_describe(path) for path in selected]


def preprocess_pdf_and_ingest_to_chroma(
    pdf_path: str,
    chroma_path: str,
    collection_name: str,
    *,
    describe_images: bool = DESCRIBE_IMAGES,
    max_images_per_page: Optional[int] = MAX_IMAGES_PER_PAGE,
    ocr_enabled: bool = OCR_ENABLED,
    ocr_dpi: int = OCR_DPI,
    page_raster_dpi: int = PAGE_RASTER_DPI,
    probe_dpi: int = PROBE_DPI,
    probe_nonwhite_threshold: float = PROBE_NONWHITE_THRESHOLD,
    detect_inline_blocks: bool = DETECT_INLINE_BLOCKS,
    detect_drawings: bool = DETECT_DRAWINGS,
    min_drawing_area: float = MIN_DRAWING_AREA,
    rasterize_when_non_xobject_visuals: bool = RASTERIZE_WHEN_NON_XOBJECT_VISUALS,
    image_out_dirname: str = IMAGE_OUT_DIRNAME,
    embed_model: str = EMBED_MODEL,
) -> List[PageRecord]:
    """
    Return list of PageRecord(page, text, images, image_descriptions) for this PDF,
    and store each page's (text + image descriptions) into a Chroma collection.

    Args:
        pdf_path: PDF to process.
        chroma_path: Directory of the persistent Chroma store.
        collection_name: Chroma collection to create or reuse.
        describe_images: Describe saved images with MarkItDown when True.
        max_images_per_page: Cap on described images per page (None = no cap).
        ocr_enabled: OCR a page when its extracted text is empty (needs pdf2image + pytesseract).
        ocr_dpi: Render resolution for OCR.
        page_raster_dpi: Resolution of the full-page raster saved for non-XObject visuals.
        probe_dpi / probe_nonwhite_threshold: Settings forwarded to the visual probe.
        detect_inline_blocks / detect_drawings / min_drawing_area: Visual detection switches.
        rasterize_when_non_xobject_visuals: Save a page raster when visuals exist without XObject images.
        image_out_dirname: Folder (next to the PDF) where images are written.
        embed_model: SentenceTransformer model used for the page embeddings.

    Returns:
        One PageRecord per page, including pages that end up with no content.
    """
    # Init Chroma and embedding model
    chroma_client = chromadb.PersistentClient(path=chroma_path)
    collection = chroma_client.get_or_create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"}
    )
    embedder = SentenceTransformer(embed_model)

    # Init MarkItDown only when descriptions are requested; it builds an OpenAI
    # client, which should not be required for a text-only ingest.
    md = _init_markitdown(MARKITDOWN_MODEL) if describe_images else None

    # Extract pages with visuals/text via PyMuPDF if available
    image_root = os.path.join(os.path.dirname(pdf_path), image_out_dirname)
    os.makedirs(image_root, exist_ok=True)

    pages: List[PageRecord] = []

    if fitz is None:
        # Fallback: just extract text via PyPDF2 (no inline visuals) and no raster logic
        reader = PdfReader(pdf_path)
        for i, page in enumerate(reader.pages, start=1):
            text = page.extract_text() or ""
            # OCR fallback (optional)
            if ocr_enabled and not text.strip() and convert_from_path and pytesseract:
                try:
                    imgs = convert_from_path(pdf_path, dpi=ocr_dpi, first_page=i, last_page=i)
                    if imgs:
                        text = pytesseract.image_to_string(imgs[0]) or ""
                except Exception:
                    pass
            pages.append(PageRecord(page=i, text=text, images=[], image_descriptions=[]))
    else:
        # Rich path: detect XObjects / inline / drawings / probe and rasterize as needed
        doc = fitz.open(pdf_path)
        base = os.path.basename(pdf_path).replace(os.sep, "_")

        try:
            for i, page in enumerate(doc, start=1):
                text = page.get_text("text") or ""
                saved_images: List[str] = []
                text_rects = _get_text_rects(page)

                # XObject images
                try:
                    for img in page.get_images(full=True):
                        xref = img[0]
                        try:
                            info = doc.extract_image(xref)
                            ext = info.get("ext", "png")
                            out = os.path.join(image_root, f"{base}_page_{i}_xref_{xref}.{ext}")
                            with open(out, "wb") as f:
                                f.write(info["image"])
                            # normalize png mode
                            if ext.lower() == "png":
                                try:
                                    with Image.open(out) as im:
                                        if im.mode not in ("L", "RGB"):
                                            im = im.convert("L")
                                        im.save(out)
                                except Exception:
                                    pass
                            saved_images.append(out)
                        except Exception:
                            # Best effort: skip an image that cannot be extracted or written.
                            pass
                except Exception:
                    pass

                has_xobject = len(saved_images) > 0

                inline_present = False
                drawings_present = False
                probe_positive = False

                # The remaining detectors only run when no XObject image was saved.
                if not has_xobject:
                    # Inline image blocks
                    if detect_inline_blocks:
                        try:
                            raw = page.get_text("rawdict") or {}
                            blocks = raw.get("blocks", []) if isinstance(raw, dict) else []
                            for b in blocks:
                                if b.get("type") == 1:  # image block
                                    inline_present = True
                                    break
                        except Exception:
                            pass

                    # Vector drawings
                    if detect_drawings and not inline_present:
                        try:
                            drawings = page.get_drawings() or []
                            for d in drawings:
                                r = d.get("rect")
                                if isinstance(r, fitz.Rect) and _rect_area(r) >= min_drawing_area:
                                    drawings_present = True
                                    break
                        except Exception:
                            pass

                    # Run the pixel probe only when the cheaper detectors found nothing.
                    if not inline_present and not drawings_present:
                        try:
                            probe_positive = _visual_probe_has_nontext(page, text_rects, probe_dpi, probe_nonwhite_threshold)
                        except Exception:
                            probe_positive = False
                    else:
                        probe_positive = True

                    # Full-page raster if visuals but no XObjects
                    if rasterize_when_non_xobject_visuals and probe_positive:
                        try:
                            scale = page_raster_dpi / 72.0
                            pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
                            out = os.path.join(image_root, f"{base}_page_{i}_raster.png")
                            pix.save(out)
                            saved_images.append(out)
                        except Exception:
                            pass

                # OCR fallback if text empty
                if ocr_enabled and not text.strip() and convert_from_path and pytesseract:
                    try:
                        imgs = convert_from_path(pdf_path, dpi=ocr_dpi, first_page=i, last_page=i)
                        if imgs:
                            text = pytesseract.image_to_string(imgs[0]) or text
                    except Exception:
                        pass

                pages.append(PageRecord(page=i, text=text, images=saved_images, image_descriptions=[]))
        finally:
            # Release the document even if a page raises unexpectedly.
            doc.close()

    # Describe images if requested
    if describe_images:
        _describe_images_for_pages(pages, md, max_images_per_page)

    # Upsert into Chroma (page-level “text + image descriptions”)
    ids, documents, metadatas = [], [], []
    fname = os.path.basename(pdf_path)

    def _combine(p: PageRecord) -> str:
        base = (p.text or "").strip()
        if p.image_descriptions:
            join = "\n".join([f"[Image {k+1}] {d}" for k, d in enumerate(p.image_descriptions)])
            return f"{base}\n{join}" if base else join
        return base

    for p in pages:
        combined = _combine(p)
        if not combined:
            # Pages with neither text nor descriptions are not stored.
            continue
        ids.append(f"{fname}_page_{p.page}")
        documents.append(combined)
        metadatas.append({
            "document_name": fname,
            "page": int(p.page),
            "image_files": json.dumps(p.images),
            "image_descriptions": json.dumps(p.image_descriptions),
            "source_path": pdf_path,
        })

    if documents:
        # embed + upsert
        vecs = []
        for i in range(0, len(documents), BATCH_SIZE_EMBED):
            chunk = documents[i:i+BATCH_SIZE_EMBED]
            # Reuse the model loaded above: constructing SentenceTransformer per
            # batch reloaded the weights each time and ignored `embed_model`.
            embs = embedder.encode(chunk, show_progress_bar=False)
            vecs.extend([e.tolist() for e in embs])

        for i in range(0, len(documents), BATCH_SIZE_UPSERT):
            collection.upsert(
                ids=ids[i:i+BATCH_SIZE_UPSERT],
                documents=documents[i:i+BATCH_SIZE_UPSERT],
                metadatas=metadatas[i:i+BATCH_SIZE_UPSERT],
                embeddings=vecs[i:i+BATCH_SIZE_UPSERT],
            )

    return pages


# ============================ SECTION BUILD (from preprocessed pages) ============================

def extract_sections_from_pages(
    pages: List[PageRecord],
    start_page: int = 12,
    pages_per_section: int = 10
) -> Tuple[Dict[str, str], List[str]]:
    """
    Group preprocessed pages into fixed-size sections.

    Each page contributes its stripped text followed by one "[Image k] ..." line
    per image description. Sections start at start_page, span pages_per_section
    page numbers, and are titled "Section k: Pages a-b".

    Returns:
        (sections, names): title -> combined text, and the titles in creation order.
    """
    def _page_blob(rec) -> str:
        body = (rec.text or "").strip()
        if rec.image_descriptions:
            captions = "\n".join(
                f"[Image {n}] {desc}" for n, desc in enumerate(rec.image_descriptions, start=1)
            )
            body = body + "\n" + captions
        return body.strip()

    # Page number -> combined content (a later duplicate page number overwrites an earlier one).
    content_by_page = {rec.page: _page_blob(rec) for rec in pages}
    last_page = max(content_by_page) if content_by_page else 0

    sections: Dict[str, str] = {}
    ordinal = 1
    for first in range(start_page, last_page + 1, pages_per_section):
        end_exclusive = min(first + pages_per_section, last_page + 1)
        if end_exclusive <= first:
            continue
        texts = [content_by_page.get(pg, "") for pg in range(first, end_exclusive)]
        title = f"Section {ordinal}: Pages {first}-{end_exclusive - 1}"
        sections[title] = "\n".join(t for t in texts if t).strip()
        ordinal += 1

    return sections, list(sections.keys())


# ============================ RULES / TEST CARDS / PAIRWISE (your original logic) ============================

def extract_rules_for_section(section_name, section_text):
    """
    Ask the LLM for every testable rule in one section of the standard.

    Returns:
        (section_name, markdown): the name passed in and the model's Markdown reply.
    """
    # Fixed instructions that describe the task and the required output layout.
    instructions = (
        f"You are a MIL-STD-188 compliance and test planning expert.\n"
        f"Analyze the following section of a military standard and extract EVERY possible testable rule, specification, constraint, or requirement. "
        f"Rules MUST be extremely detailed, explicit, and step-by-step, and should include measurable criteria, acceptable ranges, and referenced figures or tables if mentioned. "
        f"For ambiguous or implicit requirements, describe a specific test strategy.\n"
        f"Generate a short, content-based TITLE for this section (do not use page numbers).\n"
        f"Organize your output as follows, using markdown headings and bolded text:\n\n"
        f"## [Section Title]\n"
        f"**Dependencies:**\n- List detailed dependencies as explicit tests, if any.\n\n"
        f"**Conflicts:**\n- List detected or possible conflicts and provide recommendations or mitigation steps.\n\n"
        f"**Test Rules:**\n1. (Very detailed, step-by-step numbered test rules)\n"
    )
    # The section itself, followed by the escape hatch for untestable content.
    payload = f"\nSection Name: {section_name}\n\nSection Text:\n{section_text}\n\n"
    closing = (
        f"---\n"
        f"If you find truly nothing testable, reply: 'No testable rules in this section.'"
    )
    chat = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
    reply = chat([HumanMessage(content=instructions + payload + closing)])
    return section_name, reply.content


def synthesize_pairwise_test_plan(pair_name, rules_text_1, rules_text_2):
    """
    Merge the extracted rules of two consecutive sections into one combined test plan.

    Returns:
        (pair_name, markdown): the key passed in and the model's combined plan.
    """
    prompt_segments = [
        "You are a senior QA documentation engineer.\n",
        "Given the DETAILED test rules and extracted section titles for two consecutive MIL-STD-188 sections, synthesize a single, logically organized, highly detailed test plan section.\n",
        "Combine rules, merge similar steps, cross-reference overlapping content, and call out dependencies or conflicts. ",
        "Use a single, **content-based TITLE** for this combined section (not using page numbers). ",
        "Keep bold markdown headings for 'Dependencies', 'Conflicts', and 'Test Rules'.\n",
        "Test rules must be extremely explicit, step-by-step, and cover ALL possible technical details and verification steps.\n",
        "Format the output using markdown.\n\n",
        "=== SECTION 1 ===\n",
        rules_text_1,
        "\n\n=== SECTION 2 ===\n",
        rules_text_2,
        "\n\n=== END ===\n\nOutput ONLY the combined test plan in the described format.",
    ]
    # Plain concatenation: the segments already carry their own separators.
    request = "".join(prompt_segments)
    chat = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
    reply = chat([HumanMessage(content=request)])
    return pair_name, reply.content


def build_test_card_for_section(section_name, rules_markdown):
    """
    Have the LLM turn a section's rules into a Markdown pipe table ("Test Card").

    If the reply contains no '|' character at all, a one-row placeholder table is
    returned instead so the section still gets a card.
    """
    request = "".join([
        "You are a QA test documentation assistant.\n",
        "From the following section rules (Markdown), generate a single Markdown pipe table named 'Test Card' ",
        "that lists one row per test. Do NOT include any text before or after the table.\n",
        "Requirements:\n",
        "- Columns: Test ID | Test Title | Procedures | Executed | Pass | Fail | Notes\n",
        "- 'Procedures' should be concise numbered steps separated by <br> (e.g., '1) ...<br>2) ...').\n",
        "- Leave 'Executed', 'Pass', and 'Fail' empty with a checkbox symbol (use '☐'). Do NOT tick anything.\n",
        "- Derive Tests from the 'Test Rules' content. Use short, content-based titles.\n",
        "- Output ONLY the table in GitHub-style pipe-table format.\n\n",
        f"=== SECTION NAME ===\n{section_name}\n\n",
        f"=== SECTION RULES (MARKDOWN) ===\n{rules_markdown}\n\n",
        "=== END ===",
    ])
    chat = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
    reply = chat([HumanMessage(content=request)])
    card = reply.content.strip()

    if '|' in card:
        return card

    # No pipe character means the model did not produce a table; emit a placeholder.
    placeholder_rows = [
        "| Test ID | Test Title | Procedures | Executed | Pass | Fail | Notes |\n",
        "|---|---|---|---|---|---|---|\n",
        "| 1 | (LLM failed to tabulate) | See rules above | ☐ | ☐ | ☐ | |\n",
    ]
    return "".join(placeholder_rows)


# ============================ MARKDOWN BUILD ============================

def _sanitize_markdown(md: str) -> str:
    """
    Normalize LLM-produced Markdown before it is handed to pandoc.

    - Bullet glyphs ("🔹 ", "• ", "– ") become "- ".
    - Line-leading "1) " style numbering becomes "1. ".
    - Padding inside a bold span ("** text **") is trimmed to "**text**".

    Returns:
        The cleaned text, stripped and terminated by exactly one newline.
    """
    md = md.replace("🔹 ", "- ").replace("• ", "- ").replace("– ", "- ")
    md = re.sub(r'^(\s*)(\d+)\)\s+', r'\1\2. ', md, flags=re.MULTILINE)

    def _tighten(match):
        inner = match.group(1).strip()
        # Leave a whitespace-only span untouched rather than emitting "****".
        return f"**{inner}**" if inner else match.group(0)

    # Match whole bold spans left to right on a single line. Pairing a closing
    # "**" with the next opening "**" would glue the text between two bold
    # spans onto them ("**A:** see the **B**" -> "**A:**see the**B**").
    md = re.sub(r'\*\*([^*\n]+)\*\*', _tighten, md)
    return md.strip() + "\n"


def build_master_markdown(section_names, section_rule_reports, section_test_cards, pairwise_test_plans) -> str:
    """
    Assemble the master Markdown document.

    After a title and a note, each section (in section_names order) contributes
    its sanitized rules, its test card under a "### Test Card" heading, and, when
    one is keyed by that section, the sanitized pairwise plan. Every section ends
    with a \\newpage marker. All pieces are joined with a newline.
    """
    page_break = "\n\\newpage\n"
    chunks = [
        "# MIL-STD-188 Automated Compliance Test Plan\n",
        "> Generated via automated extraction and synthesis pipeline (preprocessed with image understanding + vector DB).\n",
        page_break,
    ]

    for name in section_names:
        rules = section_rule_reports.get(name, "").strip()
        if rules:
            chunks += [_sanitize_markdown(rules), "\n"]

        card = section_test_cards.get(name, "").strip()
        if card:
            chunks += ["### Test Card\n\n", card, "\n"]

        if name in pairwise_test_plans:
            chunks += [_sanitize_markdown(pairwise_test_plans[name].strip()), "\n"]

        chunks.append(page_break)

    return "\n".join(chunks)


# ============================ MAIN ============================

if __name__ == "__main__":
    # A) PREPROCESS & INGEST: page-level (text + image descriptions) → Chroma
    # Every ingestion knob is forwarded explicitly from the CONFIG constants.
    ingest_options = {
        "describe_images": DESCRIBE_IMAGES,
        "max_images_per_page": MAX_IMAGES_PER_PAGE,
        "ocr_enabled": OCR_ENABLED,
        "ocr_dpi": OCR_DPI,
        "page_raster_dpi": PAGE_RASTER_DPI,
        "probe_dpi": PROBE_DPI,
        "probe_nonwhite_threshold": PROBE_NONWHITE_THRESHOLD,
        "detect_inline_blocks": DETECT_INLINE_BLOCKS,
        "detect_drawings": DETECT_DRAWINGS,
        "min_drawing_area": MIN_DRAWING_AREA,
        "rasterize_when_non_xobject_visuals": RASTERIZE_WHEN_NON_XOBJECT_VISUALS,
        "image_out_dirname": IMAGE_OUT_DIRNAME,
        "embed_model": EMBED_MODEL,
    }
    pages = preprocess_pdf_and_ingest_to_chroma(
        pdf_path=pdf_path,
        chroma_path=chroma_path,
        collection_name=collection_name,
        **ingest_options,
    )

    # B) Build sections FROM the preprocessed pages (image descriptions are inline)
    sections, section_names = extract_sections_from_pages(pages, start_page=12, pages_per_section=10)

    # C) Extract detailed rules per section (results arrive in completion order)
    section_rule_results: Dict[str, str] = {}
    with ThreadPoolExecutor(max_workers=RULES_MAX_WORKERS) as executor:
        pending = []
        for name, text in sections.items():
            pending.append(executor.submit(extract_rules_for_section, name, text))
        for finished in as_completed(pending):
            section_name, rules_md = finished.result()
            section_rule_results[section_name] = rules_md
            print(f"\n--- {section_name} ---\n{rules_md}\n{'='*60}")

    # D) Build Test Card per section (sections without rules get no card)
    section_test_cards: Dict[str, str] = {}
    for sec in section_names:
        rules_md = section_rule_results.get(sec, "")
        if rules_md:
            table_md = build_test_card_for_section(sec, rules_md)
            section_test_cards[sec] = table_md
            print(f"\n--- Test Card for {sec} ---\n{table_md}\n{'='*60}")

    # E) Pairwise combined test plans, keyed by the first section of each pair
    pairwise_test_plans: Dict[str, str] = {}
    with ThreadPoolExecutor(max_workers=PAIRWISE_MAX_WORKERS) as executor:
        pending = []
        for s1, s2 in zip(section_names, section_names[1:]):
            rules1 = section_rule_results.get(s1, "")
            rules2 = section_rule_results.get(s2, "")
            pending.append(executor.submit(synthesize_pairwise_test_plan, s1, rules1, rules2))
        for finished in as_completed(pending):
            pair_key, combined_md = finished.result()
            pairwise_test_plans[pair_key] = combined_md
            print(f"\n--- Pair: {pair_key} ---\n{combined_md}\n{'='*60}")

    # F) Build master Markdown and convert to DOCX (Pandoc)
    master_md = build_master_markdown(section_names, section_rule_results, section_test_cards, pairwise_test_plans)
    write_docx_with_pandoc(
        master_md,
        output_docx_path=output_docx_path,
        reference_docx=None,        # optional: custom Word style template
        also_save_md=output_markdown_path,
    )


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  llm = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
  result = llm([HumanMessage(content=prompt)])



--- Section 6: Pages 62-71 ---
## Appendix B: Acronyms and Abbreviations

**Dependencies:**
- The list of acronyms and abbreviations must be consistent with their usage throughout the entire MIL-STD-188-203-1A document.
- Definitions must match those in referenced documents (if any) and be unambiguous.
- Acronyms and abbreviations should only be used as defined in this appendix throughout the standard.

**Conflicts:**
- **Potential Conflict:** If an acronym or abbreviation is defined differently elsewhere in the standard or in referenced standards, ambiguity may arise.
    - **Mitigation:** Conduct a document-wide search for each acronym and abbreviation. If discrepancies are found, flag for correction to ensure a single, authoritative definition per term.
- **Potential Conflict:** Use of undefined acronyms in other sections.
    - **Mitigation:** Cross-reference all acronyms/abbreviations found in the standard against Appendix B. Any undefined terms must be added to Appendix B or app

In [None]:
import os
import re
import json
import tempfile
import subprocess
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

# ---------- PDF & I/O ----------
from PyPDF2 import PdfReader
import pypandoc

# ---------- LLM (OpenAI via LangChain for your rules/plan/test-card) ----------
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

# ---------- Colab secret helper ----------
try:
    from google.colab import userdata  # type: ignore
except Exception:  # pragma: no cover
    userdata = None

# ---------- Vector DB + embeddings ----------
import chromadb
from sentence_transformers import SentenceTransformer

# ---------- Images & PDF visuals (PyMuPDF) ----------
import numpy as np
from PIL import Image

try:
    import fitz  # PyMuPDF
except Exception:
    fitz = None

# Optional OCR
try:
    import pytesseract  # type: ignore
except Exception:
    pytesseract = None

try:
    from pdf2image import convert_from_path  # type: ignore
except Exception:
    convert_from_path = None

# ---------- MarkItDown for image description (OpenAI multimodal) ----------
from openai import OpenAI
from markitdown import MarkItDown

# ---------- DOCX post-processing (borders) ----------
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn

# ============================ CONFIG ============================

# OpenAI key: taken from the Colab secret store when google.colab imported successfully,
# otherwise from the OPENAI_API_KEY environment variable (empty string if unset).
import openai
openai.api_key = userdata.get('OpenaAIAPI') if userdata else os.environ.get("OPENAI_API_KEY", "")
# Mirror the key into the environment as well; presumably so that clients built without an
# explicit key (e.g. the bare OpenAI() in _init_markitdown) can pick it up - TODO confirm.
os.environ["OPENAI_API_KEY"] = openai.api_key or os.environ.get("OPENAI_API_KEY", "")

# Chat model used for rule extraction, pairwise synthesis and test cards
llm_model = "gpt-4.1"

# Input PDF and output locations on the mounted drive
pdf_path = "/GD/My Drive/JITC/military_standards_pdf/MIL-STD-188-203-1A_1995.pdf"
output_docx_path = "/GD/My Drive/JITC/military_standards_pdf/MIL-STD-188-Auto-TestPlan.docx"
output_markdown_path = output_docx_path.replace(".docx", ".md")  # master Markdown saved next to the DOCX

# Vector DB locations derived from the same folder as the PDF
folder_path = os.path.dirname(pdf_path)
chroma_path = os.path.join(folder_path, "chroma_store")       # directory of the persistent Chroma store
collection_name = "military_standards"                        # Chroma collection that receives the pages

# Ingestion knobs (used as defaults of preprocess_pdf_and_ingest_to_chroma)
EMBED_MODEL = "multi-qa-mpnet-base-dot-v1"   # SentenceTransformer model name
MARKITDOWN_MODEL = "o4-mini"                 # llm_model handed to MarkItDown for image description
DESCRIBE_IMAGES = True                       # run image description after extraction
MAX_IMAGES_PER_PAGE = None                   # None = describe every image of a page
OCR_ENABLED = False                          # OCR is only attempted for pages whose extracted text is empty
OCR_DPI = 220                                # dpi passed to pdf2image for the OCR render
PAGE_RASTER_DPI = 144                        # dpi of the full-page raster saved for pages with non-XObject visuals
PROBE_DPI = 72                               # dpi of the low-resolution render used by the visual probe
PROBE_NONWHITE_THRESHOLD = 0.01              # minimum fraction of non-white pixels for the probe to fire
DETECT_INLINE_BLOCKS = True                  # look for image blocks in the page's rawdict
DETECT_DRAWINGS = True                       # look for vector drawings on the page
MIN_DRAWING_AREA = 2000.0                    # minimum drawing rect area to count; presumably in PDF points^2 - confirm
RASTERIZE_WHEN_NON_XOBJECT_VISUALS = True    # save a page raster when visuals exist but no XObject image was saved
IMAGE_OUT_DIRNAME = "_extracted_images"      # sub-directory (next to the PDF) for extracted images
BATCH_SIZE_EMBED = 32                        # documents per embedding call
BATCH_SIZE_UPSERT = 64                       # documents per Chroma upsert call

# Thread-pool sizes used by the main pipeline for rule extraction and pairwise synthesis
RULES_MAX_WORKERS = 8
PAIRWISE_MAX_WORKERS = 4

# Pandoc raw OpenXML page break for DOCX (works in Word); relies on the raw_attribute extension
PAGE_BREAK_MD = "\n```{=openxml}\n<w:p><w:r><w:br w:type=\"page\"/></w:r></w:p>\n```\n"

# ============================ PANDOC ============================

def ensure_pandoc():
    """Check that pypandoc can locate pandoc; if it cannot, download one via pypandoc."""
    try:
        pypandoc.get_pandoc_path()
    except OSError:
        # pypandoc signals a missing binary with OSError
        print("Pandoc not found by pypandoc. Attempting to download via pypandoc...")
        pypandoc.download_pandoc()
    else:
        return

def _ensure_table_borders_docx(docx_path: str):
    """
    Rewrite the DOCX at *docx_path* in place so that every table carries explicit
    single-line borders on all outer and inner edges.

    Any existing <w:tblBorders> element of a table is dropped and replaced, which
    avoids having to rely on Pandoc attributes or custom table styles.
    """
    edge_names = ('top', 'left', 'bottom', 'right', 'insideH', 'insideV')
    # Attributes applied to every edge. OOXML measures w:sz in eighths of a point
    # (TODO confirm the intended line thickness).
    edge_attributes = (
        ('w:val', 'single'),
        ('w:sz', '8'),
        ('w:space', '0'),
        ('w:color', 'auto'),
    )

    document = Document(docx_path)
    for tbl in document.tables:
        properties = tbl._tbl.tblPr

        # discard whatever border definition is already present
        for stale in properties.findall(qn('w:tblBorders')):
            properties.remove(stale)

        # build the replacement border definition edge by edge
        borders = OxmlElement('w:tblBorders')
        for edge_name in edge_names:
            edge = OxmlElement(f'w:{edge_name}')
            for attr_name, attr_value in edge_attributes:
                edge.set(qn(attr_name), attr_value)
            borders.append(edge)
        properties.append(borders)

    document.save(docx_path)

def write_docx_with_pandoc(markdown_text: str, output_docx_path: str, reference_docx: Optional[str] = None, also_save_md: Optional[str] = None):
    """
    Convert *markdown_text* to a DOCX file with pandoc and give all tables borders.

    Args:
        markdown_text: Full Markdown document to convert.
        output_docx_path: Destination path of the DOCX file.
        reference_docx: Optional Word template passed to pandoc as --reference-doc.
        also_save_md: Optional path; when given, the Markdown is also written there.

    The Markdown is handed to pandoc through a temporary file, which is removed
    again once the conversion has finished (successfully or not).
    """
    ensure_pandoc()

    if also_save_md:
        # dirname is "" for a bare filename, and os.makedirs("") would raise
        md_dir = os.path.dirname(also_save_md)
        if md_dir:
            os.makedirs(md_dir, exist_ok=True)
        with open(also_save_md, "w", encoding="utf-8") as f:
            f.write(markdown_text)

    # Helpful for debugging your environment; purely informational, so failures are ignored.
    try:
        ver = subprocess.check_output(["pandoc", "-v"]).decode(errors="ignore").splitlines()[0]
        print(f"[pandoc] {ver}")
    except (OSError, subprocess.SubprocessError, IndexError):
        pass

    extra_args = ["--toc", "--toc-depth=3", "--number-sections"]
    if reference_docx:
        extra_args += ["--reference-doc", reference_docx]

    # NOTE: +attributes is deliberately not requested to avoid the
    # "Unknown extension: attributes" error on older pandoc.
    # Keep raw OpenXML + fenced_code_blocks so page breaks render correctly.
    from_format = (
        "markdown"
        "+raw_attribute"
        "+fenced_code_blocks"
        "+fenced_divs"
        "+pipe_tables"
        "+autolink_bare_uris"
    )

    # delete=False so the file can be reopened by pandoc after we close it;
    # we are therefore responsible for removing it ourselves.
    with tempfile.NamedTemporaryFile(suffix=".md", delete=False, mode="w", encoding="utf-8") as tmp:
        tmp.write(markdown_text)
        tmp_md_path = tmp.name

    try:
        pypandoc.convert_file(
            tmp_md_path,
            to="docx",
            outputfile=output_docx_path,
            extra_args=extra_args,
            format=from_format
        )
    finally:
        try:
            os.remove(tmp_md_path)
        except OSError:
            pass  # best-effort cleanup; a leftover temp file must not mask a conversion error
    print(f"Test plan saved to {output_docx_path} (from={from_format})")

    # Ensure all tables have borders (since custom-style wasn't possible without +attributes)
    _ensure_table_borders_docx(output_docx_path)

# ============================ INGEST (PDF → pages with text+image desc → Chroma) ============================

@dataclass
class PageRecord:
    """Extraction result for a single PDF page.

    Instances are created by preprocess_pdf_and_ingest_to_chroma and read again by
    extract_sections_from_pages.
    """
    # Page number; the producers in this file number pages with enumerate(..., start=1).
    page: int
    # Text extracted for the page (may be an empty string).
    text: str
    # Paths of the image files saved for this page (embedded images or a full-page raster).
    images: List[str]
    # One entry per described image; filled in by _describe_images_for_pages.
    image_descriptions: List[str]

def _rect_area(rect: "fitz.Rect") -> float:
    try:
        return rect.get_area()
    except Exception:
        return max(0.0, (rect.x1 - rect.x0) * (rect.y1 - rect.y0))

def _get_text_rects(page: "fitz.Page") -> List["fitz.Rect"]:
    rects: List["fitz.Rect"] = []
    try:
        for b in page.get_text("blocks") or []:
            # (x0,y0,x1,y1, text, block_no, block_type, block_flags)
            if len(b) >= 8 and b[6] == 0:
                rects.append(fitz.Rect(b[0], b[1], b[2], b[3]))
    except Exception:
        pass
    return rects

def _visual_probe_has_nontext(page: "fitz.Page", text_rects: List["fitz.Rect"], probe_dpi: int, nonwhite_threshold: float) -> bool:
    scale = max(1e-6, probe_dpi / 72.0)
    pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)  # RGB8
    w, h = pix.width, pix.height
    if w == 0 or h == 0:
        return False
    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(h, w, pix.n)

    # mask text rects to white
    for r in text_rects:
        x0 = max(0, int(r.x0 * scale)); y0 = max(0, int(r.y0 * scale))
        x1 = min(w, int(r.x1 * scale)); y1 = min(h, int(r.y1 * scale))
        if x1 > x0 and y1 > y0:
            img[y0:y1, x0:x1, :] = 255

    nonwhite = np.any(img < 250, axis=2)
    ratio = nonwhite.sum() / (w * h)
    return ratio >= nonwhite_threshold

def _init_markitdown(model_name: str) -> MarkItDown:
    """Create a MarkItDown converter backed by a fresh OpenAI client, using
    *model_name* as its LLM model."""
    return MarkItDown(llm_client=OpenAI(), llm_model=model_name)

def _describe_images_for_pages(pages: List[PageRecord], md: MarkItDown, max_images_per_page: Optional[int]) -> None:
    """Fill in image_descriptions on every page record, in place.

    Each image path of a page (optionally capped at *max_images_per_page*) is run
    through md.convert; a failed conversion yields an error string in its slot
    instead of aborting the run.
    """

    def describe_one(image_path: str) -> str:
        try:
            converted = md.convert(image_path)
            return getattr(converted, "text_content", None) or str(converted)
        except Exception as e:
            return f"Image description failed: {e}"

    for record in pages:
        selected = list(record.images)
        if max_images_per_page is not None:
            selected = selected[:max_images_per_page]
        record.image_descriptions = [describe_one(path) for path in selected]

def preprocess_pdf_and_ingest_to_chroma(
    pdf_path: str,
    chroma_path: str,
    collection_name: str,
    *,
    describe_images: bool = DESCRIBE_IMAGES,
    max_images_per_page: Optional[int] = MAX_IMAGES_PER_PAGE,
    ocr_enabled: bool = OCR_ENABLED,
    ocr_dpi: int = OCR_DPI,
    page_raster_dpi: int = PAGE_RASTER_DPI,
    probe_dpi: int = PROBE_DPI,
    probe_nonwhite_threshold: float = PROBE_NONWHITE_THRESHOLD,
    detect_inline_blocks: bool = DETECT_INLINE_BLOCKS,
    detect_drawings: bool = DETECT_DRAWINGS,
    min_drawing_area: float = MIN_DRAWING_AREA,
    rasterize_when_non_xobject_visuals: bool = RASTERIZE_WHEN_NON_XOBJECT_VISUALS,
    image_out_dirname: str = IMAGE_OUT_DIRNAME,
    embed_model: str = EMBED_MODEL,
) -> List[PageRecord]:
    """
    Extract every page of a PDF (text, images, optional image descriptions) and
    store one document per non-empty page in a persistent Chroma collection.

    Args:
        pdf_path: PDF to process.
        chroma_path: Directory of the persistent Chroma store.
        collection_name: Collection that receives the page documents.
        describe_images: Run MarkItDown image description on the saved images.
        max_images_per_page: Cap on described images per page (None = no cap).
        ocr_enabled: Try OCR (pdf2image + pytesseract) for pages without text.
        ocr_dpi: Render resolution for OCR.
        page_raster_dpi: Resolution of the full-page raster saved for pages with
            visuals that are not XObject images.
        probe_dpi: Resolution of the low-res render used by the visual probe.
        probe_nonwhite_threshold: Non-white pixel fraction at which the probe fires.
        detect_inline_blocks: Look for image blocks in the page's rawdict.
        detect_drawings: Look for vector drawings on the page.
        min_drawing_area: Minimum rect area for a drawing to count.
        rasterize_when_non_xobject_visuals: Save a page raster when visuals were
            detected but no XObject image was saved.
        image_out_dirname: Sub-directory (next to the PDF) for extracted images.
        embed_model: SentenceTransformer model name used for the embeddings.

    Returns:
        One PageRecord per PDF page, in page order.
    """
    # Init Chroma and the embedding model (loaded once, up front, so a bad model
    # name fails before any expensive extraction work)
    chroma_client = chromadb.PersistentClient(path=chroma_path)
    collection = chroma_client.get_or_create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"}
    )
    embedder = SentenceTransformer(embed_model)

    # Extracted images are written next to the PDF
    image_root = os.path.join(os.path.dirname(pdf_path), image_out_dirname)
    os.makedirs(image_root, exist_ok=True)

    pages: List[PageRecord] = []

    if fitz is None:
        # Fallback: just extract text via PyPDF2 (no inline visuals) and no raster logic
        reader = PdfReader(pdf_path)
        for i, page in enumerate(reader.pages, start=1):
            text = page.extract_text() or ""
            # OCR fallback (optional), only for pages without any text
            if ocr_enabled and not text.strip() and convert_from_path and pytesseract:
                try:
                    imgs = convert_from_path(pdf_path, dpi=ocr_dpi, first_page=i, last_page=i)
                    if imgs:
                        text = pytesseract.image_to_string(imgs[0]) or ""
                except Exception:
                    pass  # OCR is best-effort
            pages.append(PageRecord(page=i, text=text, images=[], image_descriptions=[]))
    else:
        # Rich path: detect XObjects / inline / drawings / probe and rasterize as needed
        doc = fitz.open(pdf_path)
        base = os.path.basename(pdf_path).replace(os.sep, "_")

        try:
            for i, page in enumerate(doc, start=1):
                text = page.get_text("text") or ""
                saved_images: List[str] = []
                text_rects = _get_text_rects(page)

                # XObject images: save each embedded image to disk (best-effort per image)
                try:
                    for img in page.get_images(full=True):
                        xref = img[0]
                        try:
                            info = doc.extract_image(xref)
                            ext = info.get("ext", "png")
                            out = os.path.join(image_root, f"{base}_page_{i}_xref_{xref}.{ext}")
                            with open(out, "wb") as f:
                                f.write(info["image"])
                            # normalize png mode: anything other than L/RGB is rewritten as grayscale
                            if ext.lower() == "png":
                                try:
                                    with Image.open(out) as im:
                                        if im.mode not in ("L", "RGB"):
                                            im = im.convert("L")
                                        im.save(out)
                                except Exception:
                                    pass
                            saved_images.append(out)
                        except Exception:
                            pass
                except Exception:
                    pass

                has_xobject = len(saved_images) > 0

                inline_present = False
                drawings_present = False
                probe_positive = False

                if not has_xobject:
                    # Inline image blocks
                    if detect_inline_blocks:
                        try:
                            raw = page.get_text("rawdict") or {}
                            blocks = raw.get("blocks", []) if isinstance(raw, dict) else []
                            for b in blocks:
                                if b.get("type") == 1:  # image block
                                    inline_present = True
                                    break
                        except Exception:
                            pass

                    # Vector drawings (only checked when no inline image was found)
                    if detect_drawings and not inline_present:
                        try:
                            drawings = page.get_drawings() or []
                            for d in drawings:
                                r = d.get("rect")
                                if isinstance(r, fitz.Rect) and _rect_area(r) >= min_drawing_area:
                                    drawings_present = True
                                    break
                        except Exception:
                            pass

                    # The pixel probe is the last resort; inline images or drawings
                    # already count as a positive result.
                    if not inline_present and not drawings_present:
                        try:
                            probe_positive = _visual_probe_has_nontext(page, text_rects, probe_dpi, probe_nonwhite_threshold)
                        except Exception:
                            probe_positive = False
                    else:
                        probe_positive = True

                    # Full-page raster if visuals but no XObjects
                    if rasterize_when_non_xobject_visuals and probe_positive:
                        try:
                            scale = page_raster_dpi / 72.0
                            pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
                            out = os.path.join(image_root, f"{base}_page_{i}_raster.png")
                            pix.save(out)
                            saved_images.append(out)
                        except Exception:
                            pass

                # OCR fallback if text empty
                if ocr_enabled and not text.strip() and convert_from_path and pytesseract:
                    try:
                        imgs = convert_from_path(pdf_path, dpi=ocr_dpi, first_page=i, last_page=i)
                        if imgs:
                            text = pytesseract.image_to_string(imgs[0]) or text
                    except Exception:
                        pass

                pages.append(PageRecord(page=i, text=text, images=saved_images, image_descriptions=[]))
        finally:
            # release the document even if page processing raised
            doc.close()

    # Describe images if requested. The MarkItDown/OpenAI client is only built
    # here, so a run with describe_images=False does not need OpenAI credentials.
    if describe_images:
        md = _init_markitdown(MARKITDOWN_MODEL)
        _describe_images_for_pages(pages, md, max_images_per_page)

    # Upsert into Chroma (page-level "text + image descriptions")
    ids, documents, metadatas = [], [], []
    fname = os.path.basename(pdf_path)

    def _combine(p: PageRecord) -> str:
        # page text followed by one "[Image k] ..." line per description
        base_text = (p.text or "").strip()
        if p.image_descriptions:
            join = "\n".join([f"[Image {k+1}] {d}" for k, d in enumerate(p.image_descriptions)])
            return f"{base_text}\n{join}" if base_text else join
        return base_text

    for p in pages:
        combined = _combine(p)
        if not combined:
            continue  # nothing to index for an empty page
        ids.append(f"{fname}_page_{p.page}")
        documents.append(combined)
        metadatas.append({
            "document_name": fname,
            "page": int(p.page),
            "image_files": json.dumps(p.images),
            "image_descriptions": json.dumps(p.image_descriptions),
            "source_path": pdf_path,
        })

    if documents:
        # embed in batches with the single embedder created above (honours embed_model)
        vecs = []
        for i in range(0, len(documents), BATCH_SIZE_EMBED):
            chunk = documents[i:i+BATCH_SIZE_EMBED]
            embs = embedder.encode(chunk, show_progress_bar=False)
            vecs.extend([e.tolist() for e in embs])

        for i in range(0, len(documents), BATCH_SIZE_UPSERT):
            collection.upsert(
                ids=ids[i:i+BATCH_SIZE_UPSERT],
                documents=documents[i:i+BATCH_SIZE_UPSERT],
                metadatas=metadatas[i:i+BATCH_SIZE_UPSERT],
                embeddings=vecs[i:i+BATCH_SIZE_UPSERT],
            )

    return pages

# ============================ SECTION BUILD (from preprocessed pages) ============================

def extract_sections_from_pages(
    pages: List[PageRecord],
    start_page: int = 12,
    pages_per_section: int = 10
) -> Tuple[Dict[str, str], List[str]]:
    """
    Group *preprocessed* pages into fixed-size sections.

    Each page contributes its stripped text followed by one "[Image k] ..." line
    per image description. Sections start at *start_page* and span
    *pages_per_section* consecutive page numbers (the last one may be shorter).

    Returns (sections_dict, section_names) where the keys/names have the form
    "Section k: Pages a-b" and appear in section order.
    """
    # page number -> combined text of that page
    text_by_page: Dict[int, str] = {}
    for record in pages:
        body = (record.text or "").strip()
        if record.image_descriptions:
            image_lines = "\n".join(
                f"[Image {n}] {desc}" for n, desc in enumerate(record.image_descriptions, start=1)
            )
            body = body + "\n" + image_lines
        text_by_page[record.page] = body.strip()

    last_page = max(text_by_page, default=0)

    sections: Dict[str, str] = {}
    number = 1
    for first in range(start_page, last_page + 1, pages_per_section):
        stop = min(first + pages_per_section, last_page + 1)  # exclusive
        if stop <= first:
            continue
        page_texts = [text_by_page.get(pg, "") for pg in range(first, stop)]
        title = f"Section {number}: Pages {first}-{stop - 1}"
        # pages without any content are skipped when joining
        sections[title] = "\n".join(filter(None, page_texts)).strip()
        number += 1

    return sections, list(sections)

# ============================ RULES / TEST CARDS / PAIRWISE ============================

def extract_rules_for_section(section_name, section_text):
    """
    Ask the chat model to extract detailed, testable rules from one section.

    Args:
        section_name: Label of the section (echoed into the prompt and returned).
        section_text: Text of the section to analyse.

    Returns:
        (section_name, markdown) where markdown is the model's reply.
    """
    llm = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
    prompt = (
        f"You are a MIL-STD-188 compliance and test planning expert.\n"
        f"Analyze the following section of a military standard and extract EVERY possible testable rule, specification, constraint, or requirement. "
        f"Rules MUST be extremely detailed, explicit, and step-by-step, and should include measurable criteria, acceptable ranges, and referenced figures or tables if mentioned. "
        f"For ambiguous or implicit requirements, describe a specific test strategy.\n"
        f"Generate a short, content-based TITLE for this section (do not use page numbers).\n"
        f"Organize your output as follows, using markdown headings and bolded text:\n\n"
        f"## [Section Title]\n"
        f"**Dependencies:**\n- List detailed dependencies as explicit tests, if any.\n\n"
        f"**Conflicts:**\n- List detected or possible conflicts and provide recommendations or mitigation steps.\n\n"
        f"**Test Rules:**\n1. (Very detailed, step-by-step numbered test rules)\n"
        f"\nSection Name: {section_name}\n\nSection Text:\n{section_text}\n\n"
        f"---\n"
        f"If you find truly nothing testable, reply: 'No testable rules in this section.'"
    )
    # .invoke replaces the deprecated direct call llm([...])
    result = llm.invoke([HumanMessage(content=prompt)])
    return section_name, result.content

def synthesize_pairwise_test_plan(pair_name, rules_text_1, rules_text_2):
    """
    Ask the chat model to merge the rule reports of two consecutive sections
    into one combined test-plan section.

    Args:
        pair_name: Key under which the caller stores the result (returned unchanged).
        rules_text_1: Rules Markdown of the first section.
        rules_text_2: Rules Markdown of the second section.

    Returns:
        (pair_name, markdown) where markdown is the model's reply.
    """
    llm = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
    prompt = (
        "You are a senior QA documentation engineer.\n"
        "Given the DETAILED test rules and extracted section titles for two consecutive MIL-STD-188 sections, synthesize a single, logically organized, highly detailed test plan section.\n"
        "Combine rules, merge similar steps, cross-reference overlapping content, and call out dependencies or conflicts. "
        "Use a single, **content-based TITLE** for this combined section (not using page numbers). "
        "Keep bold markdown headings for 'Dependencies', 'Conflicts', and 'Test Rules'.\n"
        "Test rules must be extremely explicit, step-by-step, and cover ALL possible technical details and verification steps.\n"
        "Format the output using markdown.\n\n"
        "=== SECTION 1 ===\n" + rules_text_1 + "\n\n=== SECTION 2 ===\n" + rules_text_2 +
        "\n\n=== END ===\n\nOutput ONLY the combined test plan in the described format."
    )
    # .invoke replaces the deprecated direct call llm([...])
    result = llm.invoke([HumanMessage(content=prompt)])
    return pair_name, result.content

def _strip_code_fence(text):
    """Return *text* stripped, without a surrounding ``` fence if it has one."""
    stripped = text.strip()
    lines = stripped.splitlines()
    if len(lines) >= 3 and lines[0].startswith("```") and lines[-1].strip() == "```":
        return "\n".join(lines[1:-1]).strip()
    return stripped


def build_test_card_for_section(section_name, rules_markdown):
    """
    Ask the chat model for a 'Test Card' pipe table derived from a section's rules.

    Args:
        section_name: Label of the section (echoed into the prompt).
        rules_markdown: Rules Markdown previously extracted for the section.

    Returns:
        The table as Markdown. If the reply contains no '|' at all, a one-row
        placeholder table is returned instead.
    """
    llm = ChatOpenAI(model_name=llm_model, openai_api_key=openai.api_key)
    prompt = (
        "You are a QA test documentation assistant.\n"
        "From the following section rules (Markdown), generate a single Markdown pipe table named 'Test Card' "
        "that lists one row per test. Do NOT include any text before or after the table.\n"
        "Requirements:\n"
        "- Columns: Test ID | Test Title | Procedures | Executed | Pass | Fail | Notes\n"
        "- 'Procedures' should be concise numbered steps separated by <br> (e.g., '1) ...<br>2) ...').\n"
        "- Leave 'Executed', 'Pass', and 'Fail' empty with a checkbox symbol (use '☐'). Do NOT tick anything.\n"
        "- Derive Tests from the 'Test Rules' content. Use short, content-based titles.\n"
        "- Output ONLY the table in GitHub-style pipe-table format.\n\n"
        f"=== SECTION NAME ===\n{section_name}\n\n"
        f"=== SECTION RULES (MARKDOWN) ===\n{rules_markdown}\n\n"
        "=== END ==="
    )
    # .invoke replaces the deprecated direct call llm([...])
    result = llm.invoke([HumanMessage(content=prompt)])
    # A reply wrapped in a code fence would be rendered as a code block, not a table
    table_md = _strip_code_fence(result.content)

    if '|' not in table_md:
        header = "| Test ID | Test Title | Procedures | Executed | Pass | Fail | Notes |\n"
        sep    = "|---|---|---|---|---|---|---|\n"
        table_md = header + sep + "| 1 | (LLM failed to tabulate) | See rules above | ☐ | ☐ | ☐ | |\n"
    return table_md

# ============================ MARKDOWN BUILD ============================

def _sanitize_markdown(md: str) -> str:
    md = md.replace("🔹 ", "- ").replace("• ", "- ").replace("– ", "- ")
    md = re.sub(r'^(\s*)(\d+)\)\s+', r'\1\2. ', md, flags=re.MULTILINE)
    md = re.sub(r'\*\*\s+(.*?)\s+\*\*', r'**\1**', md)
    return md.strip() + "\n"

def build_master_markdown(section_names, section_rule_reports, section_test_cards, pairwise_test_plans) -> str:
    """
    Assemble the complete test-plan Markdown document.

    After a title block, each section (in the order of *section_names*) contributes,
    where available: its sanitized rules, a "Test Card" heading with its table, and
    the sanitized combined plan stored under the section's name. Every section is
    followed by a DOCX page break.
    """
    chunks = [
        "# MIL-STD-188 Automated Compliance Test Plan\n",
        "> Generated via automated extraction and synthesis pipeline (preprocessed with image understanding + vector DB).\n",
        PAGE_BREAK_MD,  # page break after the title block
    ]

    for name in section_names:
        rules = section_rule_reports.get(name, "").strip()
        if rules:
            chunks += [_sanitize_markdown(rules), "\n"]

        # The table is emitted as-is; its borders are added when the DOCX is post-processed
        card = section_test_cards.get(name, "").strip()
        if card:
            chunks += ["### Test Card\n\n", card.rstrip() + "\n\n"]

        if name in pairwise_test_plans:
            chunks += [_sanitize_markdown(pairwise_test_plans[name].strip()), "\n"]

        # page break between sections
        chunks.append(PAGE_BREAK_MD)

    return "\n".join(chunks)

# ============================ MAIN ============================

if __name__ == "__main__":
    divider = '=' * 60

    # Step A: extract every page (text + image descriptions) and index it in Chroma
    pages = preprocess_pdf_and_ingest_to_chroma(
        pdf_path,
        chroma_path,
        collection_name,
        describe_images=DESCRIBE_IMAGES,
        max_images_per_page=MAX_IMAGES_PER_PAGE,
        ocr_enabled=OCR_ENABLED,
        ocr_dpi=OCR_DPI,
        page_raster_dpi=PAGE_RASTER_DPI,
        probe_dpi=PROBE_DPI,
        probe_nonwhite_threshold=PROBE_NONWHITE_THRESHOLD,
        detect_inline_blocks=DETECT_INLINE_BLOCKS,
        detect_drawings=DETECT_DRAWINGS,
        min_drawing_area=MIN_DRAWING_AREA,
        rasterize_when_non_xobject_visuals=RASTERIZE_WHEN_NON_XOBJECT_VISUALS,
        image_out_dirname=IMAGE_OUT_DIRNAME,
        embed_model=EMBED_MODEL,
    )

    # Step B: group the preprocessed pages (image descriptions included) into sections
    sections, section_names = extract_sections_from_pages(pages, start_page=12, pages_per_section=10)

    # Step C: extract detailed rules for all sections concurrently
    section_rule_results: Dict[str, str] = {}
    with ThreadPoolExecutor(max_workers=RULES_MAX_WORKERS) as pool:
        pending = [
            pool.submit(extract_rules_for_section, title, body)
            for title, body in sections.items()
        ]
        for done in as_completed(pending):
            finished_name, finished_rules = done.result()
            section_rule_results[finished_name] = finished_rules
            print(f"\n--- {finished_name} ---\n{finished_rules}\n{divider}")

    # Step D: one test card per section that produced rules (sequential)
    section_test_cards: Dict[str, str] = {}
    for sec in section_names:
        sec_rules = section_rule_results.get(sec, "")
        if sec_rules:
            card = build_test_card_for_section(sec, sec_rules)
            section_test_cards[sec] = card
            print(f"\n--- Test Card for {sec} ---\n{card}\n{divider}")

    # Step E: combined plan for every pair of consecutive sections, keyed by the first one
    pairwise_test_plans: Dict[str, str] = {}
    with ThreadPoolExecutor(max_workers=PAIRWISE_MAX_WORKERS) as pool:
        pending = [
            pool.submit(
                synthesize_pairwise_test_plan,
                first,
                section_rule_results.get(first, ""),
                section_rule_results.get(second, ""),
            )
            for first, second in zip(section_names, section_names[1:])
        ]
        for done in as_completed(pending):
            pair_key, combined_md = done.result()
            pairwise_test_plans[pair_key] = combined_md
            print(f"\n--- Pair: {pair_key} ---\n{combined_md}\n{divider}")

    # Step F: assemble the master Markdown and convert it to DOCX
    master_md = build_master_markdown(section_names, section_rule_results, section_test_cards, pairwise_test_plans)
    write_docx_with_pandoc(
        master_md,
        output_docx_path=output_docx_path,
        reference_docx=None,        # optional: custom Word style template
        also_save_md=output_markdown_path
    )



--- Section 6: Pages 62-71 ---
## DTS-to-TDS Parallel Computer Interface: Input Requirements, Error Status, and Interrupt Handling

**Dependencies:**
- **MIL-STD-1397 Type A, Category I (NTDS SLOW) Interface Compliance:** All electrical, timing, and signal characteristics must comply with this referenced standard.
- **Tables and Figures:** Testing requires reference to TABLE VI (Error Status Summary), TABLE VII (Bit Assignments), TABLE VIII (Interrupt Codes), FIGURE 11 (Signal Flow Diagram), and cross-referencing with 5.2.4.1 and TABLE V for K0–K16 parity group definitions.
- **Timing Parameters:** Requirements for timing of EIR reset and data placement depend on specifics in section 30.3.

**Conflicts:**
- **Ambiguity in “Allotted Time” for EIR Reset:** The phrase "after waiting the allotted time as specified in 30.3" requires test planners to obtain precise timing limits from section 30.3. If not provided or ambiguous, the test must validate for all reasonable edge cases and recomme