# Step 2: LLM Queries – Set 1 (A1–A7) and Set 2 (B1–B7)

This notebook runs **both** prompt sets in one place:
- **Set 1**: A1 (FSN), A2 (semantic tag), A3 (definition status), A4–A7 (parents, grandparents, children, siblings).
- **Set 2**: B1 (official name), B2 (kind), B3 (category), B4–B7 (broader, grandparents, narrower, peers).

**Prerequisites**: Run `step1_ground_truth.ipynb` in the `ground_truth/` folder first.

Run **Step 3** (`step3_accuracy.ipynb`) after this notebook.

## Configuration

In [1]:
import os
import re
import time
from datetime import datetime
from pathlib import Path

import pandas as pd

# ============================================================
# Configuration
# ============================================================

# --- Detect LLM folder and paths ---
_cwd = Path(".").resolve()
if _cwd.name.startswith("testing_"):
    REPO_ROOT = _cwd.parent
    LLM_NAME = _cwd.name.replace("testing_", "")
else:
    REPO_ROOT = _cwd
    LLM_NAME = "gemini"

OUTPUT_ROOT = (REPO_ROOT / "output").resolve()
PIPELINE_ROOT = OUTPUT_ROOT / LLM_NAME
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)
PIPELINE_ROOT.mkdir(parents=True, exist_ok=True)

# Shared ground truth (from step1_ground_truth.ipynb in ground_truth/ folder)
GT_ROOT = OUTPUT_ROOT / "ground_truth"

# Create NEW run directory for this LLM
existing_runs = [d for d in PIPELINE_ROOT.iterdir() if d.is_dir() and d.name.startswith("run_")]
RUN_ID = len(existing_runs) + 1
RUN_DIR = PIPELINE_ROOT / f"run_{RUN_ID:03d}"

# Step directories
SET1_DIR = RUN_DIR / "step2_llm_set1"
SET2_DIR = RUN_DIR / "step2_llm_set2"

# Output files
SET1_OUT = SET1_DIR / "set1_llm_output.csv"
SET2_OUT = SET2_DIR / "set2_llm_output.csv"

# Create directories
for d in [SET1_DIR, SET2_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print("RUN_DIR:", RUN_DIR)
print("GT_ROOT:", GT_ROOT)

RUN_DIR: C:\Users\kanik\OneDrive\Documents\llm_as_ontology_server\llm-as-ontology-server\output\gemini\run_001
GT_ROOT: C:\Users\kanik\OneDrive\Documents\llm_as_ontology_server\llm-as-ontology-server\output\ground_truth


## Load Validated Concepts from Shared Ground Truth

In [2]:
# ============================================================
# Load validated concepts from shared Ground Truth (Step 1)
# ============================================================

VALIDATED_CONCEPTS_PATH = GT_ROOT / "validated_concepts.csv"
if not VALIDATED_CONCEPTS_PATH.exists():
    raise FileNotFoundError(
        f"Validated concepts not found at {VALIDATED_CONCEPTS_PATH}\n"
        f"Run step1_ground_truth.ipynb in the ground_truth/ folder first!"
    )

val_df = pd.read_csv(VALIDATED_CONCEPTS_PATH, dtype=str).fillna("")
CONCEPT_TERMS = val_df["concept_term"].tolist()

# Show replacement summary
replaced = val_df[val_df["status"] == "replaced"]
if not replaced.empty:
    print("Replaced concepts (original -> replacement):")
    for _, row in replaced.iterrows():
        print(f"  {row['original_term']} -> {row['concept_term']}")

print(f"\nTotal concepts to query: {len(CONCEPT_TERMS)}")


Total concepts to query: 400


## Set 1 – Prompt and Helpers

In [3]:
assert SET1_DIR.exists(), "SET1_DIR missing."
LOG_PATH_SET1 = SET1_DIR / "logs.txt"
MODEL_NAME = "gemini-1.5-pro"  # Gemini model
PROMPT_TEMPLATE_SET1 = """
You are acting as a SNOMED CT ontology browser.

Given the concept: "{CONCEPT_TERM}"

Return ONLY the following fields.
Use ONLY "is-a" taxonomic relationships.
Do NOT explain anything.

A1) FSN-style name (include semantic tag)
A2) Semantic tag
A3) Definition status (Primitive / Fully defined)
A4) Immediate parent concept(s) (depth -1)
A5) Grandparent concept(s) (depth -2, parents of parents)
A6) Immediate child concept(s) (depth +1)
A7) Near siblings (same parent)

Rules:
- Bullet lists for A4–A7
- Exact labels A1–A7
- No extra text
""".strip()

In [4]:
def _csv_safe(x):
    if x is None:
        return ""
    return str(x).replace("\r", " ").replace("\n", " ").strip()

A_LABELS = ["A1", "A2", "A3", "A4", "A5", "A6", "A7"]

def parse_A1_A7(raw: str) -> dict:
    text = (raw or "").replace("\r\n", "\n").replace("\r", "\n").strip()
    out = {k: "" for k in A_LABELS}
    label_re = re.compile(r"(?m)^\s*(A[1-7])\)\s*(.*)$")
    matches = list(label_re.finditer(text))
    if not matches:
        return out
    idx = {m.group(1): {"start": m.start(), "after": m.group(2).strip()} for m in matches}
    def section(label):
        if label not in idx:
            return ""
        start = idx[label]["start"]
        ends = [idx[k]["start"] for k in idx if idx[k]["start"] > start]
        end = min(ends) if ends else len(text)
        return text[start:end]
    for k in ["A1", "A2", "A3"]:
        val = idx[k]["after"] if k in idx else ""
        out[k] = val.strip() if val else ""
    bullet_re = re.compile(r"(?m)^\s*[-*•]\s+(.*)$")
    for k in ["A4", "A5", "A6", "A7"]:
        block = section(k)
        items = [m.group(1).strip() for m in bullet_re.finditer(block)]
        items = [i.replace("|", " ") for i in items]
        out[k] = "|".join(items) if items else ""
    return out

B_LABELS = ["B1", "B2", "B3", "B4", "B5", "B6", "B7"]

def parse_B1_B7(raw: str) -> dict:
    text = (raw or "").replace("\r\n", "\n").replace("\r", "\n").strip()
    out = {k: "" for k in B_LABELS}
    label_re = re.compile(r"(?m)^\s*(B[1-7])\)\s*(.*)$")
    matches = list(label_re.finditer(text))
    if not matches:
        return out
    idx = {m.group(1): {"start": m.start(), "after": m.group(2).strip()} for m in matches}
    def section(label):
        if label not in idx:
            return ""
        start = idx[label]["start"]
        ends = [idx[k]["start"] for k in idx if idx[k]["start"] > start]
        end = min(ends) if ends else len(text)
        return text[start:end]
    for k in ["B1", "B2", "B3"]:
        val = idx[k]["after"] if k in idx else ""
        out[k] = val.strip() if val else ""
    bullet_re = re.compile(r"(?m)^\s*[-*•]\s+(.*)$")
    for k in ["B4", "B5", "B6", "B7"]:
        block = section(k)
        items = [m.group(1).strip() for m in bullet_re.finditer(block)]
        items = [i.replace("|", " ") for i in items]
        out[k] = "|".join(items) if items else ""
    return out

## Initialize LLM Client

In [None]:
import google.generativeai as genai
import os

genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
if not genai.api_key:
    raise EnvironmentError("GOOGLE_API_KEY is not set.")

model = genai.GenerativeModel(MODEL_NAME)
print("Gemini client initialized.")

  from .autonotebook import tqdm as notebook_tqdm

All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as genai


AttributeError: module 'google.generativeai' has no attribute 'api_key'

## Set 1 – Resume and Process

In [None]:
if SET1_OUT.exists():
    existing_df_set1 = pd.read_csv(SET1_OUT, dtype=str).fillna("")
    done_terms_set1 = set(existing_df_set1["concept_term"].tolist())
else:
    existing_df_set1 = pd.DataFrame()
    done_terms_set1 = set()
print(f"Set 1 – concepts remaining: {len(CONCEPT_TERMS) - len(done_terms_set1)}")

In [None]:
rows_set1 = []

for concept_term in CONCEPT_TERMS:
    if concept_term in done_terms_set1:
        with LOG_PATH_SET1.open("a") as f:
            f.write(f"{datetime.now().isoformat()}\t{concept_term}\tSKIP\n")
        continue

    prompt = PROMPT_TEMPLATE_SET1.format(CONCEPT_TERM=concept_term)

    try:
        response = model.generate_content(prompt)
        raw_output = response.text or ""
        parsed = parse_A1_A7(raw_output)

        rows_set1.append({
            "timestamp": datetime.now().isoformat(),
            "model": MODEL_NAME,
            "prompt_set": "set1",
            "concept_term": concept_term,
            "A1_fsn": _csv_safe(parsed["A1"]),
            "A2_semantic_tag": _csv_safe(parsed["A2"]),
            "A3_definition_status": _csv_safe(parsed["A3"]),
            "A4_parents": _csv_safe(parsed["A4"]),
            "A5_grandparents": _csv_safe(parsed["A5"]),
            "A6_children": _csv_safe(parsed["A6"]),
            "A7_siblings": _csv_safe(parsed["A7"]),
        })

        with LOG_PATH_SET1.open("a") as f:
            f.write(
                "\n" + "="*80 + "\n" + datetime.now().isoformat() + "\n"
                + "CONCEPT: " + concept_term + "\n\n" + raw_output.strip() + "\n"
            )

    except Exception as e:
        with LOG_PATH_SET1.open("a") as f:
            f.write(f"{datetime.now().isoformat()}\t{concept_term}\tERROR\t{e}\n")

    time.sleep(0.2)

print(f"Set 1 processed {len(rows_set1)} new concepts.")

In [None]:
if rows_set1:
    out_df = pd.DataFrame(rows_set1)
    combined = pd.concat([existing_df_set1, out_df], ignore_index=True) if not existing_df_set1.empty else out_df
    combined = combined[["timestamp", "model", "prompt_set", "concept_term", "A1_fsn", "A2_semantic_tag", "A3_definition_status", "A4_parents", "A5_grandparents", "A6_children", "A7_siblings"]]
    combined.to_csv(SET1_OUT, index=False)
print("Set 1 complete.", SET1_OUT)

## Set 2 – Prompt, Resume and Process

In [None]:
assert SET2_DIR.exists(), "SET2_DIR missing."
LOG_PATH_SET2 = SET2_DIR / "logs.txt"
PROMPT_TEMPLATE_SET2 = """
For the term: "{CONCEPT_TERM}"

Answer ONLY with the items below.
Do NOT explain.

B1) Most precise official-style name
B2) What kind of thing it is (semantic type)
B3) Category type (Primitive / Fully defined)
B4) More general terms (immediate broader concepts)
B5) Grandparent terms (broader concepts two levels up)
B6) More specific terms (immediate narrower concepts)
B7) Terms at the same generality level (peers / siblings)

Rules:
- Bullet lists where applicable
- Exact labels B1–B7
- No extra text
""".strip()

In [None]:
if SET2_OUT.exists():
    existing_df_set2 = pd.read_csv(SET2_OUT, dtype=str).fillna("")
    done_terms_set2 = set(existing_df_set2["concept_term"].tolist())
else:
    existing_df_set2 = pd.DataFrame()
    done_terms_set2 = set()
print(f"Set 2 – concepts remaining: {len(CONCEPT_TERMS) - len(done_terms_set2)}")

In [None]:
rows_set2 = []

for concept_term in CONCEPT_TERMS:
    if concept_term in done_terms_set2:
        with LOG_PATH_SET2.open("a") as f:
            f.write(f"{datetime.now().isoformat()}\t{concept_term}\tSKIP\n")
        continue

    prompt = PROMPT_TEMPLATE_SET2.format(CONCEPT_TERM=concept_term)

    try:
        response = model.generate_content(prompt)
        raw_output = response.text or ""
        parsed = parse_B1_B7(raw_output)

        rows_set2.append({
            "timestamp": datetime.now().isoformat(),
            "model": MODEL_NAME,
            "prompt_set": "set2",
            "concept_term": concept_term,
            "B1_official_name": _csv_safe(parsed["B1"]),
            "B2_kind": _csv_safe(parsed["B2"]),
            "B3_category_type": _csv_safe(parsed["B3"]),
            "B4_immediate_broader": _csv_safe(parsed["B4"]),
            "B5_grandparents": _csv_safe(parsed["B5"]),
            "B6_immediate_narrower": _csv_safe(parsed["B6"]),
            "B7_peer_terms": _csv_safe(parsed["B7"]),
        })

        with LOG_PATH_SET2.open("a") as f:
            f.write(
                "\n" + "="*80 + "\n" + datetime.now().isoformat() + "\n"
                + "CONCEPT: " + concept_term + "\n\n" + raw_output.strip() + "\n"
            )

    except Exception as e:
        with LOG_PATH_SET2.open("a") as f:
            f.write(f"{datetime.now().isoformat()}\t{concept_term}\tERROR\t{e}\n")

    time.sleep(0.2)

print(f"Set 2 processed {len(rows_set2)} new concepts.")

In [None]:
if rows_set2:
    out_df = pd.DataFrame(rows_set2)
    combined = pd.concat([existing_df_set2, out_df], ignore_index=True) if not existing_df_set2.empty else out_df
    combined = combined[["timestamp", "model", "prompt_set", "concept_term", "B1_official_name", "B2_kind", "B3_category_type", "B4_immediate_broader", "B5_grandparents", "B6_immediate_narrower", "B7_peer_terms"]]
    combined.to_csv(SET2_OUT, index=False)
print("Set 2 complete.", SET2_OUT)
print("Run step3_accuracy.ipynb next.")