## API Ingestion 
This file describes how metadata from API endpoints can be ingested for retrieval.
It also provides code for enriching this metadata with an LLM to generate summaries and keywords.

In [None]:
### These code snippets can be added later to Ingestion_Ext.py 
import requests
import xml.etree.ElementTree as ET
from urllib.parse import urlparse
from datetime import datetime
import re

def _local_name(tag):
    """Return an element tag without its ``{namespace-uri}`` prefix."""
    if not isinstance(tag, str):
        return ""
    return tag.rsplit("}", 1)[-1]


def _first_text(root, name):
    """Return the text of the first element (document order) whose local name is *name*, else None."""
    for node in root.iter():
        if _local_name(node.tag) == name:
            return node.text
    return None


def _layer_names(root):
    """Collect the names of all Layer / FeatureType elements found under *root*."""
    names = []
    for node in root.iter():
        if _local_name(node.tag) not in ("Layer", "FeatureType"):
            continue
        # Only look at direct children: a descendant search would pick up the
        # name of a nested layer or style for a parent layer that has no name.
        for child in node:
            # WMS/WFS name their layers with <Name>, WMTS with <ows:Identifier>.
            if _local_name(child.tag) in ("Name", "Identifier") and child.text:
                names.append(child.text)
                break
    return names


def extract_api_metadata(api_url):
    """
    Extracts metadata from PDOK / WFS / WMS / WMTS / REST endpoints.

    The service type is guessed from the URL text. For OGC services the
    GetCapabilities document is fetched and parsed; any other URL is fetched
    with a plain GET and treated as a JSON REST endpoint.

    Args:
        api_url: Endpoint URL; must start with ``http://`` or ``https://``.

    Returns:
        Always a dict (never None). It contains "url", "type" and "valid".
        When "valid" is False an "error" message is normally present.
        On success it also holds "service_type" and either
        "title" / "abstract" / "keywords" / "date_ingested" (plus
        "available_layers", capped at 10, when layers were found) for OGC
        services, or "keys" (first 10 top-level JSON keys) for REST.

    No exception is propagated: any failure is printed as a warning and
    stored under "error".
    """
    metadata = {"url": api_url, "type": "api_endpoint", "valid": False}
    try:
        # Normalize URL
        api_url = api_url.strip()
        if not re.match(r'^https?://', api_url):
            print(f"[WARN] Invalid URL format: {api_url}")
            metadata["error"] = "Invalid URL format"
            return metadata

        # Identify service type
        lower_url = api_url.lower()
        if "wfs" in lower_url:
            service_type = "WFS"
        elif "wms" in lower_url:
            service_type = "WMS"
        elif "wmts" in lower_url:
            service_type = "WMTS"
        else:
            service_type = "REST/Unknown"

        metadata["service_type"] = service_type

        # Try requesting capabilities for OGC services
        if service_type in ("WFS", "WMS", "WMTS"):
            # OGC parameter values are matched case-insensitively, so do not
            # append a second request when e.g. "request=getcapabilities" is present.
            if "getcapabilities" not in lower_url:
                sep = "&" if "?" in api_url else "?"
                api_url = f"{api_url}{sep}service={service_type}&request=GetCapabilities"

            r = requests.get(api_url, timeout=10)
            if r.status_code != 200:
                metadata["error"] = f"HTTP {r.status_code}"
                return metadata

            # Parse the raw bytes so the parser honours the encoding declared
            # in the XML prolog instead of the charset requests guesses.
            xml_root = ET.fromstring(r.content)

            # Elements are matched by local name only: the namespace URIs
            # differ between service versions (WMS 1.3.0, WFS 1.1/2.0,
            # OWS 1.0/1.1, WMTS 1.0), so fixed prefixes miss most documents.
            keywords = [
                kw.text
                for kw in xml_root.iter()
                if _local_name(kw.tag) == "Keyword" and kw.text
            ]

            metadata.update({
                "valid": True,
                "title": _first_text(xml_root, "Title"),
                "abstract": _first_text(xml_root, "Abstract"),
                "keywords": keywords,
                "date_ingested": datetime.now().isoformat(),
            })

            # Extract available layers / feature types (first 10 only)
            layers = _layer_names(xml_root)
            if layers:
                metadata["available_layers"] = layers[:10]

        else:
            # Fallback for REST APIs: try a GET and parse JSON
            r = requests.get(api_url, timeout=10)
            content_type = r.headers.get("Content-Type", "")
            if r.status_code != 200:
                # Same error shape as the OGC branch above.
                metadata["error"] = f"HTTP {r.status_code}"
            elif "application/json" not in content_type:
                metadata["error"] = f"Unrecognized response type: {r.headers.get('Content-Type')}"
            else:
                data = r.json()
                if isinstance(data, dict):
                    metadata.update({
                        "valid": True,
                        "service_type": "REST",
                        "keys": list(data)[:10]
                    })
                else:
                    # Arrays / scalars have no top-level keys to report.
                    metadata["error"] = "JSON response is not an object"

    except Exception as e:
        # Best-effort boundary: one bad endpoint must not abort a whole run.
        metadata["error"] = str(e)
        print(f"[WARN] Failed to extract API metadata: {e}")
    return metadata


In [None]:
# Adding API ingestion to the existing pipeline (Includes some connection hints and modifications for merging with Ingestion_Ext.py)
def _chroma_safe_metadata(meta):
    """Return a flat copy of *meta* that holds only str/int/float/bool values.

    None values are dropped and every other value (lists, dicts, ...) is
    stored as a JSON string.
    """
    # NOTE(review): Chroma's metadata validation has historically accepted only
    # scalar values - confirm against the installed chromadb version.
    safe = {}
    for key, value in meta.items():
        if value is None:
            continue
        if isinstance(value, (str, int, float, bool)):
            safe[key] = value
        else:
            safe[key] = json.dumps(value, ensure_ascii=False, default=str)
    return safe


def ingest_api_endpoints(persist_dir, api_urls):
    """
    Ingests API endpoints (e.g. PDOK WFS/WMS/REST) into ChromaDB
    with semantic enrichment.

    Args:
        persist_dir: Directory of the persistent Chroma store; created if missing.
        api_urls: Iterable of endpoint URLs. Duplicate URLs are processed once.

    Each URL is passed through extract_api_metadata(); endpoints that are not
    valid are skipped with a warning. The remaining metadata is merged with
    the result of enrich_metadata(), serialised to JSON as the document text,
    embedded with the "all-MiniLM-L6-v2" model and added to the "api_layers"
    collection. Nothing is written when no endpoint is valid.
    """
    os.makedirs(persist_dir, exist_ok=True)
    model = SentenceTransformer("all-MiniLM-L6-v2")
    client = PersistentClient(path=persist_dir)
    coll = client.get_or_create_collection("api_layers")

    docs, metas, ids = [], [], []

    # dict.fromkeys keeps the first occurrence of each URL in order, so a
    # repeated URL cannot yield the same id twice within one add() batch.
    for api_url in dict.fromkeys(api_urls):
        meta = extract_api_metadata(api_url)
        if not meta.get("valid", False):
            print(f"[WARN] Skipping invalid API endpoint: {api_url}")
            continue

        # Enrich metadata semantically
        # "title" can be present with a None value, so fall back with `or`.
        print(f"[INFO] Enriching API metadata for {meta.get('title') or api_url}")
        enriched = enrich_metadata(meta)
        if enriched:
            meta.update(enriched)

        # The document text keeps the full nested structure; only the
        # metadata handed to the collection is flattened.
        doc_text = json.dumps(meta, indent=2)
        docs.append(doc_text)
        metas.append(_chroma_safe_metadata(meta))
        ids.append(make_id("api", api_url))

    if not docs:
        # Avoid calling add() with empty lists when every endpoint was skipped.
        print("[INFO] No valid API endpoints to ingest.")
        return

    # Prepare and store embeddings
    embeddings = model.encode(docs, convert_to_numpy=True).tolist()
    coll.add(documents=docs, metadatas=metas, ids=ids, embeddings=embeddings)
    print(f"[INFO] Ingested {len(docs)} valid API endpoints into 'api_layers'.")


In [None]:
# Adjustments to the main block
# Register the optional --api_endpoints flag on the existing parser `p`.
# nargs="*" collects zero or more URLs into a list; the value stays None when
# the flag is not given at all.
# NOTE(review): this line presumably belongs before the point where the
# arguments are parsed into `args` - confirm when merging.
p.add_argument("--api_endpoints", nargs="*", default=None, help="List of WFS/WMS/REST API endpoints to ingest")

# Run the API ingestion only when at least one endpoint was supplied
# (both None and an empty list are falsy and skip this step).
if args.api_endpoints:
    ingest_api_endpoints(args.persist_dir, args.api_endpoints)

# When running from bash: use the same command as before and add --api_endpoints followed by one or more endpoint URLs