In [1]:
# Cell 1: Environment Setup and Configuration (Reworked)
# Status: Uses detailed dataclasses. Enforces EnhancedConfig usage. Enhanced secret loading.
#         Added Vector DB & Service Endpoint configs. Added MIZ OKI schema version.

import os
import json
import logging
import time
import uuid
import datetime
import re
import hashlib
from abc import ABC, abstractmethod
from collections import deque, defaultdict, Counter
from typing import Dict, Any, Optional, List, Union, Tuple, Set, Type, Callable, TypeVar, Protocol
from contextlib import contextmanager
from dataclasses import dataclass, field, asdict

# --- Data Handling ---
import numpy as np
import pandas as pd

# --- Cloud Provider SDKs ---
# Wrap imports in try-except to handle environments where they might be missing initially
try:
    from google.cloud import aiplatform
    from google.cloud import storage
    from google.cloud import bigquery
    from google.cloud import secretmanager # Import Secret Manager client
    from google.cloud import exceptions as gcp_exceptions
    GCP_SDK_AVAILABLE = True
except ImportError:
    GCP_SDK_AVAILABLE = False
    # Define dummy classes/exceptions if SDKs are missing
    class aiplatform: pass; class storage: pass; class bigquery: pass; class secretmanager: pass; class gcp_exceptions: class NotFound(Exception): pass; class PermissionDenied(Exception): pass; class GoogleAPIError(Exception): pass
    logging.warning("Google Cloud SDK components not found. Functionality will be limited.")


# --- Database/Graph (Check import, no connectivity test here) ---
try:
    from neo4j import GraphDatabase, basic_auth, exceptions as neo4j_exceptions
    NEO4J_AVAILABLE = True
except ImportError:
    NEO4J_AVAILABLE = False
    class GraphDatabase: pass; class basic_auth: pass; class neo4j_exceptions: pass # Dummy types
    logging.warning("Neo4j Python driver not found. Neo4j functionality unavailable.")

# --- Setup Logging (Early Configuration) ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger('MIZ-OKI.Environment')

# --- Custom Exceptions ---
class ConfigurationError(Exception):
    """Custom exception for configuration errors"""
    pass

# --- Configuration Dataclasses ---

@dataclass
class Neo4jConfig:
    """Neo4j database configuration"""
    uri: str = field(default_factory=lambda: os.getenv("NEO4J_URI", "neo4j+s://localhost:7687"))
    user: str = field(default_factory=lambda: os.getenv("NEO4J_USER", "neo4j"))
    password: Optional[str] = field(default=None) # Loaded later
    max_connection_lifetime: int = 3600
    connection_timeout: int = 30
    max_retry_time: int = 30

@dataclass
class DatabaseConfig:
    """Database configuration settings"""
    neo4j: Neo4jConfig = field(default_factory=Neo4jConfig)
    bigquery_project_id: Optional[str] = field(default_factory=lambda: os.getenv("GOOGLE_CLOUD_PROJECT"))
    bigquery_dataset_id: str = field(default_factory=lambda: os.getenv("BQ_DATASET", "miz3_data"))
    storage_bucket: Optional[str] = field(default_factory=lambda: os.getenv("GCS_BUCKET_NAME"))

@dataclass
class SecretManagerConfig:
    """Configuration for accessing secrets via Google Secret Manager."""
    enabled: bool = field(default_factory=lambda: os.getenv("USE_SECRET_MANAGER", "true").lower() == "true")
    project_id: Optional[str] = None # Defaults to main GCP project if None
    neo4j_uri_secret: Optional[str] = field(default_factory=lambda: os.environ.get("NEO4J_URI_SECRET_ID"))
    neo4j_user_secret: Optional[str] = field(default_factory=lambda: os.environ.get("NEO4J_USER_SECRET_ID"))
    neo4j_password_secret: Optional[str] = field(default_factory=lambda: os.environ.get("NEO4J_PASSWORD_SECRET_ID"))
    miz_salt_secret: Optional[str] = field(default_factory=lambda: os.environ.get("MIZ_SALT_SECRET_ID"))
    openai_api_key_secret: Optional[str] = field(default_factory=lambda: os.environ.get("OPENAI_API_KEY_SECRET_ID"))
    anthropic_api_key_secret: Optional[str] = field(default_factory=lambda: os.environ.get("ANTHROPIC_API_KEY_SECRET_ID"))
    # Add other secrets as needed (e.g., external API keys, service endpoint keys)
    kg_tool_api_key_secret: Optional[str] = field(default_factory=lambda: os.environ.get("KG_TOOL_API_KEY_SECRET_ID"))
    moe_registry_api_key_secret: Optional[str] = field(default_factory=lambda: os.environ.get("MOE_REGISTRY_API_KEY_SECRET_ID"))

@dataclass
class GcpConfig:
    project_id: Optional[str] = field(default_factory=lambda: os.environ.get("GOOGLE_CLOUD_PROJECT"))
    region: str = field(default_factory=lambda: os.environ.get("GOOGLE_CLOUD_REGION", "us-central1"))
    gcs_bucket_name: Optional[str] = None # Derived later
    bq_dataset: str = "miz3_data"
    bq_dataset_location: str = "US" # Or derive from region
    secrets: SecretManagerConfig = field(default_factory=SecretManagerConfig)

@dataclass
class VertexAIConfig:
    # Paths for storing tool/agent configurations used by Agent Builder/Workflows
    agent_builder_tool_config_gcs_path: Optional[str] = None # Derived later
    # Default Workflow IDs for core processes
    default_workflow_id: str = field(default_factory=lambda: os.getenv("DEFAULT_WORKFLOW_ID", "miz3-core-cycle"))
    planning_workflow_id: str = field(default_factory=lambda: os.getenv("PLANNING_WORKFLOW_ID", "miz3-goal-planning"))
    experiment_execution_workflow_id: str = field(default_factory=lambda: os.getenv("EXPERIMENT_EXEC_WORKFLOW_ID", "miz3-experiment-execution"))
    experiment_analysis_workflow_id: str = field(default_factory=lambda: os.getenv("EXPERIMENT_ANALYSIS_WORKFLOW_ID", "miz3-experiment-analysis"))
    # API endpoint for interacting with Workflows API
    workflows_api_endpoint: Optional[str] = None # Derived later
    # Service account used by Workflow executions to call other services/APIs
    workflow_service_account: Optional[str] = None # Derived later
    # Agent Engine specific configurations (if using Agent Engine runtime)
    agent_engine_network: Optional[str] = field(default_factory=lambda: os.getenv("AGENT_ENGINE_NETWORK")) # VPC Network for Agent Engine
    agent_engine_security_policy: Optional[str] = field(default_factory=lambda: os.getenv("AGENT_ENGINE_SECURITY_POLICY")) # Security policy (e.g., IAP)
    # Vertex AI Search configuration (for RAG grounding)
    search_default_datastore_id: Optional[str] = field(default_factory=lambda: os.environ.get("VERTEX_SEARCH_DATASTORE_ID"))
    # Default model for simple tasks within Vertex AI context
    default_model: str = field(default_factory=lambda: os.getenv("VERTEX_DEFAULT_MODEL", "gemini-1.5-flash-001"))

@dataclass
class AdkConfig:
    # Default runtime environment for ADK agents (if not using Agent Engine)
    default_agent_runtime: str = field(default_factory=lambda: os.getenv("ADK_DEFAULT_RUNTIME", "cloud_run")) # 'cloud_run' or 'cloud_functions'
    # Default service account for deployed ADK agents
    default_agent_service_account: Optional[str] = None # Derived later
    # Base container image for ADK agents
    default_agent_base_image: str = field(default_factory=lambda: os.getenv("ADK_BASE_IMAGE", "python:3.10-slim"))
    # How ADK tools are registered/discovered (GCS file or dedicated API)
    tool_registry_type: str = field(default_factory=lambda: os.getenv("ADK_TOOL_REGISTRY_TYPE", "gcs")) # 'gcs' or 'api'
    # Location of the tool registry (GCS path or API endpoint)
    tool_registry_location: Optional[str] = None # Derived later

@dataclass
class KgConfig:
    # Primary storage for graph structure (Neo4j or potentially others)
    storage_type: str = field(default_factory=lambda: os.getenv("KG_STORAGE_TYPE", "neo4j").lower())
    neo4j: Neo4jConfig = field(default_factory=Neo4jConfig)
    # Performance/Scale parameters
    memory_efficiency: float = field(default_factory=lambda: float(os.getenv("KG_MEMORY_EFFICIENCY", "0.85")))
    entity_resolution_accuracy: float = field(default_factory=lambda: float(os.getenv("KG_ENTITY_RESOLUTION_ACCURACY", "0.995")))
    cache_ttl_minutes: int = field(default_factory=lambda: int(os.getenv("KG_CACHE_TTL_MINUTES", "5")))
    # Vector Database configuration
    vector_db_type: str = field(default_factory=lambda: os.getenv("VECTOR_DB_TYPE", "vertex_vector_search").lower()) # 'vertex_vector_search', 'neo4j', 'none'
    vector_index_name: str = field(default_factory=lambda: os.getenv("VECTOR_INDEX_NAME", "miz3_entity_embeddings"))
    vector_dimensions: int = field(default_factory=lambda: int(os.getenv("VECTOR_DIMENSIONS", "768"))) # Match embedding model
    # Specifics for Vertex AI Vector Search (Matching Engine)
    vector_db_endpoint: Optional[str] = field(default_factory=lambda: os.getenv("VERTEX_VECTOR_INDEX_ENDPOINT_NAME")) # Resource name: projects/.../indexEndpoints/...
    vector_db_public_domain_name: Optional[str] = field(default_factory=lambda: os.getenv("VERTEX_VECTOR_ENDPOINT_DOMAIN")) # Public endpoint domain if needed for direct access

@dataclass
class FoundationModelDefaults:
    # Default models for different providers/tasks
    vertex: str = field(default_factory=lambda: os.getenv("VERTEX_DEFAULT_MODEL", "gemini-1.5-flash-001"))
    llama4_scout: str = field(default_factory=lambda: os.getenv("LLAMA4_SCOUT_MODEL", "llama3-8b-instruct")) # Example mapping
    llama4_maverick: str = field(default_factory=lambda: os.getenv("LLAMA4_MAVERICK_MODEL", "llama3-70b-instruct")) # Example mapping
    llama4_embedding_model: str = field(default_factory=lambda: os.getenv("LLAMA4_EMBEDDING_MODEL", "text-embedding-004")) # Example mapping
    openai: str = field(default_factory=lambda: os.getenv("OPENAI_DEFAULT_MODEL", "gpt-4-turbo"))
    anthropic: str = field(default_factory=lambda: os.getenv("ANTHROPIC_DEFAULT_MODEL", "claude-3-5-sonnet-20240620"))
    # Specific models for internal tasks
    xai_counterfactual_model: str = field(default_factory=lambda: os.getenv("XAI_COUNTERFACTUAL_MODEL", "llama4_maverick")) # Alias used by XAI component
    feedback_analyzer_model: str = field(default_factory=lambda: os.getenv("CV_FEEDBACK_ANALYZER_MODEL", "llama4_scout")) # Alias used by CV component

@dataclass
class FoundationModelPricingEntry:
    prompt: float = 0.0 # Price per million tokens
    completion: float = 0.0 # Price per million tokens

@dataclass
class FoundationModelConfig:
    # Stores API keys loaded from secrets/env vars
    keys: Dict[str, Optional[str]] = field(default_factory=dict)
    defaults: FoundationModelDefaults = field(default_factory=FoundationModelDefaults)
    # Pricing information (can be loaded from external source or hardcoded)
    pricing: Dict[str, Dict[str, FoundationModelPricingEntry]] = field(default_factory=lambda: {
        "vertex": {
            "gemini-1.5-flash-001": FoundationModelPricingEntry(prompt=0.35, completion=0.70),
            "gemini-1.5-pro-001": FoundationModelPricingEntry(prompt=3.50, completion=10.50),
            "llama3-8b-instruct": FoundationModelPricingEntry(prompt=0.50, completion=0.50), # Example pricing
            "llama3-70b-instruct": FoundationModelPricingEntry(prompt=2.65, completion=2.65), # Example pricing
            "text-embedding-004": FoundationModelPricingEntry(prompt=0.025, completion=0.0), # Example pricing
        },
        "openai": {
            "gpt-4-turbo": FoundationModelPricingEntry(prompt=10.00, completion=30.00),
            "gpt-3.5-turbo": FoundationModelPricingEntry(prompt=0.50, completion=1.50),
        },
        "anthropic": {
            "claude-3-5-sonnet-20240620": FoundationModelPricingEntry(prompt=3.00, completion=15.00),
            "claude-3-opus-20240229": FoundationModelPricingEntry(prompt=15.00, completion=75.00),
        }
    })

@dataclass
class BusinessImpactConfig:
    # Load KPI definitions and targets (e.g., from JSON env var or config file)
    kpis: Dict[str, Dict[str, Any]] = field(default_factory=lambda: json.loads(os.getenv("MIZ_KPIS", '{}')))
    # Specific targets mentioned in whitepaper
    roas_target: float = field(default_factory=lambda: float(os.getenv("MIZ_ROAS_TARGET", "9.0"))) # Target 8-10x
    cac_reduction_target_factor: float = field(default_factory=lambda: float(os.getenv("MIZ_CAC_TARGET_FACTOR", "0.40"))) # Target 55-65% reduction -> factor ~0.4
    clv_increase_target_factor: float = field(default_factory=lambda: float(os.getenv("MIZ_CLV_TARGET_FACTOR", "1.50"))) # Target 45-55% increase -> factor ~1.5

@dataclass
class SystemThresholdsConfig:
    # Confidence thresholds for automated vs. human review
    decision_confidence_threshold: float = field(default_factory=lambda: float(os.getenv("DECISION_CONF_THRESHOLD", "0.85")))
    human_review_confidence_threshold: float = field(default_factory=lambda: float(os.getenv("HUMAN_REVIEW_CONF_THRESHOLD", "0.75")))
    # Thresholds for triggering optimization or goal generation
    optimization_threshold: float = field(default_factory=lambda: float(os.getenv("OPTIMIZATION_THRESHOLD", "0.7"))) # Score below which PO triggers HDE
    goal_generation_threshold: float = field(default_factory=lambda: float(os.getenv("GOAL_GEN_THRESHOLD", "0.6"))) # Score below which AGG generates goal

@dataclass
class XaiConfig:
    # Where XAI decision logs are stored ('kg' via API, 'log_file' via Cloud Logging)
    storage_type: str = field(default_factory=lambda: os.getenv("XAI_STORAGE_TYPE", "kg").lower())
    # Log name for Cloud Logging (if storage_type is 'log_file')
    log_name: str = field(default_factory=lambda: os.getenv("XAI_LOG_NAME", "miz_oki_xai_decisions"))
    # Alias for the model used for counterfactual explanations
    counterfactual_model_alias: str = field(default_factory=lambda: os.getenv("XAI_COUNTERFACTUAL_MODEL_ALIAS", "llama4_maverick"))

@dataclass
class LearningFlowsConfig:
    # Knowledge Distillation settings
    kd: Dict[str, Any] = field(default_factory=lambda: {
        "teacher_model_alias": os.getenv("KD_TEACHER_MODEL_ALIAS", "llama4_maverick"),
        "default_student_architecture": os.getenv("KD_DEFAULT_STUDENT_ARCH", "distilbert-base-uncased"),
        "output_gcs_prefix": "kd_outputs/" # Relative to bucket root
    })
    # Continuous Validation settings
    cv: Dict[str, Any] = field(default_factory=lambda: {
        "feedback_queue_maxsize": int(os.getenv("CV_QUEUE_MAXSIZE", "1000")),
        "monitoring_interval_seconds": int(os.getenv("CV_MONITOR_INTERVAL_SEC", "300")), # 5 minutes
        "drift_detection_threshold": float(os.getenv("CV_DRIFT_THRESHOLD", "0.1")),
        "bias_detection_threshold": float(os.getenv("CV_BIAS_THRESHOLD", "0.05")),
        "feedback_analyzer_model_alias": os.getenv("CV_FEEDBACK_ANALYZER_MODEL_ALIAS", "feedback_analyzer_model") # Use alias from FM defaults
    })
    # Dynamic Reward System settings
    drs: Dict[str, Any] = field(default_factory=lambda: {
        "base_weights": json.loads(os.getenv("DRS_BASE_WEIGHTS", '{"task_completion": 1.0, "efficiency": 0.5, "quality": 0.8}')),
        "objective_influence_factor": float(os.getenv("DRS_OBJECTIVE_FACTOR", "0.3")),
        "update_interval_seconds": int(os.getenv("DRS_UPDATE_INTERVAL_SEC", "600")) # 10 minutes
    })
    # Distributed Reinforcement Learning settings
    drl: Dict[str, Any] = field(default_factory=lambda: {
        "buffer_size": int(os.getenv("DRL_BUFFER_SIZE", "50000")),
        "buffer_save_interval_sec": int(os.getenv("DRL_SAVE_INTERVAL_SEC", "600")), # 10 minutes
        "min_buffer_for_train": int(os.getenv("DRL_MIN_BUFFER_TRAIN", "1000")),
        "buffer_gcs_prefix": "rl_buffer/" # Relative to bucket root
    })

@dataclass
class ServiceEndpointsConfig:
    """Configuration for dependent internal service endpoints."""
    # Endpoint for the deployed KG Tool service (Cell 3)
    kg_tool_api_endpoint: Optional[str] = field(default_factory=lambda: os.getenv("KG_TOOL_API_ENDPOINT")) # e.g., Cloud Run URL
    # Endpoint for the deployed MoE Registry service (Cell 4)
    moe_registry_api_endpoint: Optional[str] = field(default_factory=lambda: os.getenv("MOE_REGISTRY_API_ENDPOINT")) # e.g., Cloud Run URL
    # Endpoint for the deployed Expert Invoker service (used by HDE, PO, HP, etc.)
    expert_invoker_api_endpoint: Optional[str] = field(default_factory=lambda: os.getenv("EXPERT_INVOKER_API_ENDPOINT"))
    # Add other internal service endpoints as needed (e.g., Causal Tool, Sim Tool, External API Tools)
    ads_platform_api_endpoint: Optional[str] = field(default_factory=lambda: os.getenv("ADS_PLATFORM_API_ENDPOINT"))
    crm_api_endpoint: Optional[str] = field(default_factory=lambda: os.getenv("CRM_API_ENDPOINT"))

# --- Main Configuration Class (Reworked) ---
@dataclass
class EnhancedConfig:
    """ Root configuration object for MIZ OKI 3.0 Platform. """
    miz_oki_schema_version: str = "3.0" # Added schema version
    gcp: GcpConfig = field(default_factory=GcpConfig)
    db: DatabaseConfig = field(default_factory=DatabaseConfig)
    vertex_ai: VertexAIConfig = field(default_factory=VertexAIConfig)
    adk: AdkConfig = field(default_factory=AdkConfig)
    kg: KgConfig = field(default_factory=KgConfig)
    foundation_models: FoundationModelConfig = field(default_factory=FoundationModelConfig)
    business_impact: BusinessImpactConfig = field(default_factory=BusinessImpactConfig)
    system_thresholds: SystemThresholdsConfig = field(default_factory=SystemThresholdsConfig)
    xai: XaiConfig = field(default_factory=XaiConfig)
    learning_flows: LearningFlowsConfig = field(default_factory=LearningFlowsConfig)
    service_endpoints: ServiceEndpointsConfig = field(default_factory=ServiceEndpointsConfig) # Added service endpoints

    # Top-level settings / Other service configs
    mlops_pipeline_root: Optional[str] = None # Derived
    mlops_serving_image: str = field(default_factory=lambda: os.getenv("MLOPS_SERVING_IMAGE", "us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2.11:latest")) # Example
    mlops_trigger_topic: str = field(default_factory=lambda: os.getenv("MLOPS_TRIGGER_TOPIC", "miz3-mlops-trigger"))
    mlops_rl_train_topic: str = field(default_factory=lambda: os.getenv("MLOPS_RL_TRAIN_TOPIC", "miz3-mlops-rl-train"))
    # Dead Letter Queue target ('log_only', 'pubsub_topic', 'gcs_path')
    dlq_target: str = field(default_factory=lambda: os.getenv("DLQ_TARGET", "log_only"))
    # Salt for pseudonymization (loaded from Secret Manager or Env Var)
    miz_salt: Optional[str] = None
    # Configuration for external data sources used by AKA (Cell 4)
    external_data_sources: Dict[str, Optional[str]] = field(default_factory=lambda: json.loads(os.getenv("EXTERNAL_DATA_SOURCES", '{}'))) # e.g., {"news_api_key_secret": "NEWS_API_KEY_SECRET_ID"}
    # Privacy policies used by PrivacyControlsTool (Cell 7)
    privacy_policies: Dict[str, Dict[str, Any]] = field(default_factory=lambda: json.loads(os.getenv("PRIVACY_POLICIES", '{"default": {"requires_pseudonymization": true, "allowed_fields": null}}')))
    # Objectives used by HolisticOptimizerTool (Cell 5)
    optimizer_objectives: Dict[str, Dict[str, Any]] = field(default_factory=lambda: json.loads(os.getenv("OPTIMIZER_OBJECTIVES", '{}'))) # e.g., {"ROAS": {"target": 8.0, "weight": 0.6}}
    # Forecasting models used by HolisticOptimizerTool (Cell 5)
    optimizer_forecasting_models: Dict[str, str] = field(default_factory=lambda: json.loads(os.getenv("OPTIMIZER_FORECASTERS", '{}'))) # e.g., {"ROAS": "roas_forecaster_v1"}
    # Parameters for BEAB Tool (Cell 7)
    beab_equity_weight: float = field(default_factory=lambda: float(os.getenv("BEAB_EQUITY_WEIGHT", "0.2")))
    rtb_min_bid_threshold: float = field(default_factory=lambda: float(os.getenv("RTB_MIN_BID", "0.01")))

    def __post_init__(self):
        """ Validate critical values and derive defaults after loading from environment. """
        logger.debug("Running EnhancedConfig __post_init__ ...")
        if not self.gcp.project_id:
            raise ConfigurationError("CRITICAL: GOOGLE_CLOUD_PROJECT environment variable not set.")

        # Derive dependent GCP settings
        if not self.db.bigquery_project_id: self.db.bigquery_project_id = self.gcp.project_id
        if not self.gcp.gcs_bucket_name:
            self.gcp.gcs_bucket_name = f"{self.gcp.project_id}-miz3-data"
            logger.info(f"GCS Bucket Name derived: {self.gcp.gcs_bucket_name}")
        if not self.db.storage_bucket: self.db.storage_bucket = self.gcp.gcs_bucket_name

        if self.gcp.gcs_bucket_name:
            self.mlops_pipeline_root = self.mlops_pipeline_root or f"gs://{self.gcp.gcs_bucket_name}/miz3_pipelines"
            self.vertex_ai.agent_builder_tool_config_gcs_path = self.vertex_ai.agent_builder_tool_config_gcs_path or f"gs://{self.gcp.gcs_bucket_name}/config/vertex_tools.json"
            if self.adk.tool_registry_type == 'gcs':
                self.adk.tool_registry_location = self.adk.tool_registry_location or f"gs://{self.gcp.gcs_bucket_name}/config/adk_tools_registry.json"

        # Derive API endpoints and service accounts
        self.vertex_ai.workflows_api_endpoint = self.vertex_ai.workflows_api_endpoint or f"{self.gcp.region}-workflows.googleapis.com"
        def_sa_suffix = f"@{self.gcp.project_id}.iam.gserviceaccount.com"
        self.vertex_ai.workflow_service_account = self.vertex_ai.workflow_service_account or f"miz-workflow-runner{def_sa_suffix}"
        self.adk.default_agent_service_account = self.adk.default_agent_service_account or f"miz-adk-agent{def_sa_suffix}"

        # Resolve model aliases used internally after defaults are set
        self.xai.counterfactual_model_alias = self.foundation_models.defaults.xai_counterfactual_model
        self.learning_flows.cv['feedback_analyzer_model_alias'] = self.foundation_models.defaults.feedback_analyzer_model
        self.learning_flows.kd['teacher_model_alias'] = self.foundation_models.defaults.llama4_maverick # Example, could be configurable

        logger.debug("EnhancedConfig basic post-init derivation complete.")

    def get(self, key: str, default: Any = None) -> Any:
        """Provides dictionary-like access, navigating nested dataclasses using dot notation."""
        try:
            value = self
            keys = key.split('.')
            for k in keys:
                if isinstance(value, dict):
                    value = value.get(k)
                elif hasattr(value, k):
                    value = getattr(value, k)
                else:
                    return default # Key part not found
                if value is None: # Stop if any part resolves to None
                    return default
            return value
        except (AttributeError, KeyError, TypeError):
            return default

    def get_model_info(self, model_alias_or_id: str) -> Optional[Dict[str, Any]]:
        """ Resolves an alias (from defaults) or ID to get provider, actual model ID, and pricing. """
        if not model_alias_or_id: return None
        resolved_id = model_alias_or_id
        provider = None

        # Check if input matches an alias key in defaults dataclass
        if hasattr(self.foundation_models.defaults, model_alias_or_id):
            resolved_id = getattr(self.foundation_models.defaults, model_alias_or_id)
            logger.debug(f"Resolved alias '{model_alias_or_id}' to model ID '{resolved_id}'.")
        else:
            # Check if input matches an alias *value* (less common but possible)
            for field_name in self.foundation_models.defaults.__dataclass_fields__:
                 if getattr(self.foundation_models.defaults, field_name) == model_alias_or_id:
                      resolved_id = model_alias_or_id # It was already the resolved ID
                      logger.debug(f"Input '{model_alias_or_id}' matches a default model ID directly.")
                      break
            # If still not resolved, assume input IS the model_id
            model_id = resolved_id
            logger.debug(f"Assuming '{model_alias_or_id}' is the final model ID.")

        model_id = resolved_id # Final model ID

        # Determine provider based on model ID patterns or config keys
        if model_id:
            if any(p in model_id for p in ["gemini", "llama3", "text-embedding"]): provider = "vertex"
            elif "gpt-" in model_id: provider = "openai"
            elif "claude-" in model_id: provider = "anthropic"
            # Add more provider patterns if needed

        if not provider: # Fallback based on available keys if pattern matching fails
            if "vertex" in self.foundation_models.keys: provider = "vertex"
            elif "openai" in self.foundation_models.keys: provider = "openai"
            elif "anthropic" in self.foundation_models.keys: provider = "anthropic"

        if not provider:
            logger.warning(f"Could not determine provider for '{model_alias_or_id}' (resolved to '{model_id}').")
            return None

        # Look up pricing
        provider_pricing = self.foundation_models.pricing.get(provider, {})
        model_pricing_entry = provider_pricing.get(model_id)
        pricing_dict = asdict(model_pricing_entry) if model_pricing_entry else None
        if not pricing_dict and model_id:
            logger.warning(f"Pricing not found for model '{model_id}' under provider '{provider}'.")

        return {"provider": provider, "model_id": model_id, "pricing": pricing_dict}

# --- Helper Function to Load Secrets (Enhanced Error Handling) ---
def _load_secret(secret_id: Optional[str], project_id: Optional[str] = None) -> Optional[str]:
    """Loads a secret from Google Secret Manager. Returns None if failed or not configured."""
    if not secret_id:
        # logger.debug("Secret ID is None or empty, skipping load.")
        return None
    if not GCP_SDK_AVAILABLE or not hasattr(secretmanager, 'SecretManagerServiceClient'):
        logger.warning("Secret Manager SDK not available. Cannot load secrets.")
        return None

    try:
        client = secretmanager.SecretManagerServiceClient()
        if '/' not in secret_id:
            if not project_id:
                raise ValueError("Project ID needed for short secret names.")
            # Assume short name is just the secret ID, construct full name
            secret_name = f"projects/{project_id}/secrets/{secret_id}/versions/latest"
        elif '/versions/' not in secret_id:
            # Assume name is projects/.../secrets/SECRET_ID, add version
            secret_name = f"{secret_id}/versions/latest"
        else:
            # Assume full version name is provided
            secret_name = secret_id

        logger.debug(f"Accessing secret: {secret_name}")
        response = client.access_secret_version(name=secret_name)
        payload = response.payload.data.decode("UTF-8")
        logger.debug(f"Successfully accessed secret: {secret_id} (short name or full path)")
        return payload
    except gcp_exceptions.NotFound:
        logger.error(f"Secret not found: {secret_id} (Full path tried: {secret_name})")
        return None
    except gcp_exceptions.PermissionDenied:
        logger.error(f"Permission denied accessing secret: {secret_id}. Check SA permissions for {secret_name}.")
        return None
    except ValueError as ve:
        logger.error(f"Value error loading secret '{secret_id}': {ve}") # e.g., missing project ID
        return None
    except Exception as e:
        # Catch any other unexpected exceptions during secret access
        logger.error(f"Failed to load secret '{secret_id}' from {secret_name}: {type(e).__name__} - {e}", exc_info=True)
        return None

# --- Config Loading Function (Reworked) ---
def load_configuration() -> EnhancedConfig:
    """ Loads configuration from environment variables and Secret Manager, returning EnhancedConfig. """
    logger.info("Loading MIZ 3.0 OKI configuration (Reworked Cell 1)...")
    try:
        # Step 1: Instantiate EnhancedConfig - loads defaults from environment variables
        cfg = EnhancedConfig()
    except ConfigurationError as e:
        logger.critical(f"Initial config validation failed during __post_init__: {e}", exc_info=True)
        raise
    except Exception as e:
        logger.critical(f"Critical error during initial config instantiation: {e}", exc_info=True)
        raise ConfigurationError(f"Failed to instantiate EnhancedConfig: {e}") from e

    # Step 2: Load secrets if enabled
    secret_proj_id = cfg.gcp.secrets.project_id or cfg.gcp.project_id # Use specific secret project ID if set
    if cfg.gcp.secrets.enabled:
        if not GCP_SDK_AVAILABLE:
            logger.warning("Secret Manager is enabled in config, but SDK is not available. Skipping secret loading.")
        else:
            logger.info(f"Attempting to load secrets from Google Secret Manager (Project: {secret_proj_id})")
            # --- Neo4j Secrets ---
            cfg.db.neo4j.uri = _load_secret(cfg.gcp.secrets.neo4j_uri_secret, secret_proj_id) or cfg.db.neo4j.uri
            cfg.db.neo4j.user = _load_secret(cfg.gcp.secrets.neo4j_user_secret, secret_proj_id) or cfg.db.neo4j.user
            cfg.db.neo4j.password = _load_secret(cfg.gcp.secrets.neo4j_password_secret, secret_proj_id) or os.getenv("NEO4J_PASSWORD") # Fallback to env var

            # --- MIZ Salt ---
            cfg.miz_salt = _load_secret(cfg.gcp.secrets.miz_salt_secret, secret_proj_id) or os.getenv("MIZ_SALT")

            # --- Foundation Model API Keys ---
            cfg.foundation_models.keys['openai'] = _load_secret(cfg.gcp.secrets.openai_api_key_secret, secret_proj_id) or os.getenv("OPENAI_API_KEY")
            cfg.foundation_models.keys['anthropic'] = _load_secret(cfg.gcp.secrets.anthropic_api_key_secret, secret_proj_id) or os.getenv("ANTHROPIC_API_KEY")

            # --- Service Endpoint Secrets (e.g., API Keys for internal tools) ---
            # Example: Load API key for KG Tool if secured via API Key
            # cfg.service_endpoints.kg_tool_api_key = _load_secret(cfg.gcp.secrets.kg_tool_api_key_secret, secret_proj_id) or os.getenv("KG_TOOL_API_KEY")
            # Example: Load API key for MoE Registry if secured via API Key
            # cfg.service_endpoints.moe_registry_api_key = _load_secret(cfg.gcp.secrets.moe_registry_api_key_secret, secret_proj_id) or os.getenv("MOE_REGISTRY_API_KEY")

            # Load service endpoints themselves if they are stored in secrets (less common, usually env vars)
            cfg.service_endpoints.kg_tool_api_endpoint = _load_secret(os.getenv("KG_TOOL_API_ENDPOINT_SECRET_ID"), secret_proj_id) or cfg.service_endpoints.kg_tool_api_endpoint
            cfg.service_endpoints.moe_registry_api_endpoint = _load_secret(os.getenv("MOE_REGISTRY_API_ENDPOINT_SECRET_ID"), secret_proj_id) or cfg.service_endpoints.moe_registry_api_endpoint
            cfg.service_endpoints.expert_invoker_api_endpoint = _load_secret(os.getenv("EXPERT_INVOKER_API_ENDPOINT_SECRET_ID"), secret_proj_id) or cfg.service_endpoints.expert_invoker_api_endpoint
            cfg.service_endpoints.ads_platform_api_endpoint = _load_secret(os.getenv("ADS_PLATFORM_API_ENDPOINT_SECRET_ID"), secret_proj_id) or cfg.service_endpoints.ads_platform_api_endpoint
            cfg.service_endpoints.crm_api_endpoint = _load_secret(os.getenv("CRM_API_ENDPOINT_SECRET_ID"), secret_proj_id) or cfg.service_endpoints.crm_api_endpoint

    else:
        logger.warning("Secret Manager not enabled. Loading secrets/endpoints from environment variables ONLY.")
        # Load sensitive values directly from environment variables as fallback
        cfg.db.neo4j.password = os.getenv("NEO4J_PASSWORD")
        cfg.miz_salt = os.getenv("MIZ_SALT")
        cfg.foundation_models.keys['openai'] = os.getenv("OPENAI_API_KEY")
        cfg.foundation_models.keys['anthropic'] = os.getenv("ANTHROPIC_API_KEY")
        # cfg.service_endpoints.kg_tool_api_key = os.getenv("KG_TOOL_API_KEY")
        # cfg.service_endpoints.moe_registry_api_key = os.getenv("MOE_REGISTRY_API_KEY")
        # Endpoints typically still come from env vars even if secrets are disabled
        cfg.service_endpoints.kg_tool_api_endpoint = cfg.service_endpoints.kg_tool_api_endpoint # Already loaded from env by default
        cfg.service_endpoints.moe_registry_api_endpoint = cfg.service_endpoints.moe_registry_api_endpoint
        cfg.service_endpoints.expert_invoker_api_endpoint = cfg.service_endpoints.expert_invoker_api_endpoint
        cfg.service_endpoints.ads_platform_api_endpoint = cfg.service_endpoints.ads_platform_api_endpoint
        cfg.service_endpoints.crm_api_endpoint = cfg.service_endpoints.crm_api_endpoint


    # Step 3: Finalize loaded keys/auth
    # Remove providers with no keys/auth loaded
    cfg.foundation_models.keys = {k:v for k, v in cfg.foundation_models.keys.items() if v is not None}
    # Add 'vertex' key if GCP project is set (implies SDK auth is possible)
    if cfg.gcp.project_id:
        cfg.foundation_models.keys['vertex'] = "gcp_authenticated"

    # --- Final Validation ---
    logger.debug("Performing final configuration validation...")
    critical_errors = []
    warnings = []
    try:
        # Critical: MIZ Salt
        if not cfg.miz_salt or cfg.miz_salt == "default_insecure_salt_replace_me_!!":
            critical_errors.append("MIZ_SALT missing or insecure. Set MIZ_SALT env var or MIZ_SALT_SECRET_ID.")

        # Critical: Neo4j Config (if used)
        if cfg.kg.storage_type == "neo4j":
            if not all([cfg.db.neo4j.uri, cfg.db.neo4j.user, cfg.db.neo4j.password]):
                critical_errors.append("Neo4j configured (kg.storage_type='neo4j') but connection details (URI, User, Password) missing. Check NEO4J_* env vars or secret IDs.")
            if cfg.db.neo4j.password == "password":
                warnings.append("SECURITY ALERT: Using default 'password' for Neo4j!")
            if not NEO4J_AVAILABLE:
                critical_errors.append("Neo4j configured but 'neo4j' library not installed. Run 'pip install neo4j'.")

        # Warning: Vector DB Endpoint (if used)
        if cfg.kg.vector_db_type == "vertex_vector_search" and not cfg.kg.vector_db_endpoint:
            warnings.append("Vertex Vector Search configured (kg.vector_db_type='vertex_vector_search') but endpoint name (VERTEX_VECTOR_INDEX_ENDPOINT_NAME env var) missing.")

        # Warning: Foundation Model Keys
        if not cfg.foundation_models.keys:
            warnings.append("No Foundation Model API keys or authentication methods loaded. Check *_API_KEY env vars or secret IDs.")

        # Warning: Required Service Endpoints (adjust based on actual dependencies)
        if not cfg.service_endpoints.kg_tool_api_endpoint:
            warnings.append("KG Tool API endpoint (KG_TOOL_API_ENDPOINT env var or secret ID) not configured. KG interactions via API will fail.")
        if not cfg.service_endpoints.moe_registry_api_endpoint:
            warnings.append("MoE Registry API endpoint (MOE_REGISTRY_API_ENDPOINT env var or secret ID) not configured. Dynamic expert routing will fail.")
        if not cfg.service_endpoints.expert_invoker_api_endpoint:
            warnings.append("Expert Invoker API endpoint (EXPERT_INVOKER_API_ENDPOINT env var or secret ID) not configured. Calls to expert models will fail.")

        # Log warnings
        for warning in warnings:
            logger.warning(f"CONFIG WARNING: {warning}")

        # Raise critical errors
        if critical_errors:
            error_message = "CRITICAL CONFIGURATION ERRORS:\n" + "\n".join([f"- {err}" for err in critical_errors])
            logger.critical(error_message)
            raise ConfigurationError(error_message)

        logger.debug("EnhancedConfig final validation complete.")
    except (ConfigurationError, ValueError, ImportError) as final_val_e:
        logger.critical(f"CRITICAL CONFIGURATION ERROR during final validation: {final_val_e}.", exc_info=True)
        raise # Re-raise critical errors

    logger.info("Configuration loading complete.")
    return cfg

# --- Global Config Instance ---
CONFIG_OBJ: Optional[EnhancedConfig] = None
PROJECT_ID: Optional[str] = None
REGION: Optional[str] = None
BUCKET_NAME: Optional[str] = None

try:
    CONFIG_OBJ = load_configuration()
    PROJECT_ID = CONFIG_OBJ.gcp.project_id
    REGION = CONFIG_OBJ.gcp.region
    BUCKET_NAME = CONFIG_OBJ.gcp.gcs_bucket_name
    logger.info("Global configuration object (CONFIG_OBJ) created successfully.")
except (ConfigurationError, ValueError, ImportError, NameError, AttributeError) as config_ve:
    logger.critical(f"CRITICAL CONFIGURATION ERROR: {config_ve}. Review environment variables and Secret Manager setup.", exc_info=True)
    # Attempt to set fallbacks for basic GCP info if possible, but CONFIG_OBJ is None
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
    REGION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")
    BUCKET_NAME = f"{PROJECT_ID}-miz3-data" if PROJECT_ID else None
    CONFIG_OBJ = None
except Exception as config_e:
     logger.critical(f"CRITICAL: Unexpected error during configuration loading: {config_e}", exc_info=True)
     PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
     REGION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")
     BUCKET_NAME = f"{PROJECT_ID}-miz3-data" if PROJECT_ID else None
     CONFIG_OBJ = None
     logger.error("Falling back to default/guessed PROJECT_ID/REGION/BUCKET_NAME and empty CONFIG_OBJ.")

# --- Initialize Vertex AI SDK ---
vertex_ai_initialized = False
if CONFIG_OBJ and PROJECT_ID and REGION and BUCKET_NAME and GCP_SDK_AVAILABLE and hasattr(aiplatform, 'init'):
    try:
        # Check if already initialized (simple check)
        if not getattr(aiplatform.initializer.global_config, 'project', None):
            staging_bucket_uri = f"gs://{BUCKET_NAME}/vertex_staging"
            logger.info(f"Initializing Vertex AI SDK for project {PROJECT_ID} in {REGION} with staging bucket {staging_bucket_uri}...")
            aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=staging_bucket_uri)
            logger.info(f"Vertex AI SDK initialized.")
        else:
            logger.info(f"Vertex AI SDK already initialized for project {aiplatform.initializer.global_config.project}.")
        vertex_ai_initialized = True
    except Exception as ai_init_e:
        logger.error(f"Failed to initialize Vertex AI SDK: {ai_init_e}", exc_info=True)
elif not GCP_SDK_AVAILABLE:
     logger.warning("Vertex AI SDK not available. Skipping initialization.")
else:
     logger.warning("Config object not loaded or critical GCP values missing. Skipping Vertex AI initialization.")

# --- Check GCS Bucket Presence ---
gcs_bucket_checked = False
if CONFIG_OBJ and BUCKET_NAME and GCP_SDK_AVAILABLE and hasattr(storage, 'Client'):
    try:
        storage_client = storage.Client(project=PROJECT_ID)
        bucket = storage_client.bucket(BUCKET_NAME)
        if bucket.exists():
            logger.info(f"GCS Bucket '{BUCKET_NAME}' confirmed to exist.")
            gcs_bucket_checked = True
        else:
            logger.error(f"CRITICAL: GCS Bucket '{BUCKET_NAME}' configured but does not exist! Create the bucket or correct GCS_BUCKET_NAME.")
    except Exception as gcs_check_e:
        logger.error(f"Failed to check GCS bucket '{BUCKET_NAME}' existence: {gcs_check_e}")
elif not GCP_SDK_AVAILABLE:
    logger.warning("Storage SDK not available. Skipping GCS bucket check.")
else:
     logger.warning("GCS Bucket name not configured or PROJECT_ID missing. Skipping bucket check.")

# --- Final Status Check ---
print("\n--- MIZ 3.0 OKI Environment Status (Reworked Cell 1) ---")
if CONFIG_OBJ is None:
    print("❌ CRITICAL: Configuration loading failed. Check logs above for critical errors.")
else:
     print(f"✅ EnhancedConfig Initialized (Project: {PROJECT_ID}, Region: {REGION})")
     print(f"✅ MIZ OKI Schema Version: {CONFIG_OBJ.miz_oki_schema_version}")

     # Vertex AI Status
     if not GCP_SDK_AVAILABLE: print("🟡 WARNING: Google Cloud SDKs not fully available.")
     elif not vertex_ai_initialized: print("❌ WARNING: Vertex AI SDK initialization FAILED. Check credentials and project/region.")
     else: print("✅ Vertex AI SDK Initialized.")

     # GCS Status
     if not BUCKET_NAME: print(f"❌ CRITICAL: GCS Bucket name configuration missing (GCS_BUCKET_NAME).")
     elif not gcs_bucket_checked: print(f"❌ CRITICAL: GCS Bucket '{BUCKET_NAME}' check failed or bucket does not exist.")
     else: print(f"✅ GCS Bucket '{BUCKET_NAME}' Confirmed.")

     # Secret Manager Status
     if CONFIG_OBJ.gcp.secrets.enabled:
         if not GCP_SDK_AVAILABLE or not hasattr(secretmanager, 'SecretManagerServiceClient'): print("🟡 WARNING: Secret Manager enabled but SDK unavailable.")
         else: print("✅ Secret Manager: Enabled (Secret loading attempted). Check logs for errors.")
     else: print("ℹ️ Secret Manager: Disabled (Using Environment Variables for secrets).")

     # KG Status
     if CONFIG_OBJ.kg.storage_type == "neo4j":
          if NEO4J_AVAILABLE: print("✅ Neo4j Configured (Library found; Connectivity checked by KG component).")
          else: print(f"❌ CRITICAL: Neo4j configured but 'neo4j' library not installed.")
     else: print(f"ℹ️ KG Storage Type: '{CONFIG_OBJ.kg.storage_type}'.")

     # Vector DB Status
     if CONFIG_OBJ.kg.vector_db_type == "vertex_vector_search":
         if CONFIG_OBJ.kg.vector_db_endpoint: print("✅ Vertex Vector Search: Configured (Endpoint found).")
         else: print("🟡 WARNING: Vertex Vector Search configured but endpoint name missing (VERTEX_VECTOR_INDEX_ENDPOINT_NAME).")
     else: print(f"ℹ️ Vector DB Type: '{CONFIG_OBJ.kg.vector_db_type}'.")

     # MIZ Salt Status
     if CONFIG_OBJ.miz_salt and CONFIG_OBJ.miz_salt != "default_insecure_salt_replace_me_!!": print("✅ MIZ Salt Configured.")
     else: print("❌ CRITICAL WARNING: Using default or missing MIZ Salt! Set MIZ_SALT or MIZ_SALT_SECRET_ID.")

     # Foundation Models Status
     fm_keys_found = list(CONFIG_OBJ.foundation_models.keys.keys())
     if not fm_keys_found: print("🟡 WARNING: No Foundation Model keys/auth loaded.")
     else: print(f"✅ Foundation Models Configured. Providers with keys/auth: {fm_keys_found}")

     # Service Endpoints Status
     print(f"✅ Service Endpoints:")
     print(f"  - KG Tool API: '{CONFIG_OBJ.service_endpoints.kg_tool_api_endpoint or 'Not Set (WARNING)'}'")
     print(f"  - MoE Registry API: '{CONFIG_OBJ.service_endpoints.moe_registry_api_endpoint or 'Not Set (WARNING)'}'")
     print(f"  - Expert Invoker API: '{CONFIG_OBJ.service_endpoints.expert_invoker_api_endpoint or 'Not Set (WARNING)'}'")
     # Add checks for other critical endpoints if necessary

     print("ℹ️ Orchestration: Relies on Vertex AI Workflows/Agent Engine.")
print("-------------------------------------------------------------")
logger.info("MIZ 3.0 OKI BGI Platform - Environment configuration complete (Reworked Cell 1).")

# Check if critical errors occurred during setup
if CONFIG_OBJ is None or not gcs_bucket_checked:
    print("\n\n*** CRITICAL ERRORS DETECTED DURING SETUP. SYSTEM MAY NOT FUNCTION CORRECTLY. REVIEW LOGS. ***\n")


SyntaxError: invalid syntax (<ipython-input-1-35cc5112588d>, line 35)

In [2]:
# Cell 2: Data Extraction and KG Prep (Reworked)
# Status: Logic structured for deployment as a Tool/Service callable by Vertex AI Workflow.
#         Uses MIZ OKI I/O structure. Uses real FMClientTool proxy (Cell 18).
#         Outputs KG-ready data. Requires specific transform logic implementation.

import pandas as pd
import numpy as np
import json
import datetime
import logging
import io
import os
import time
import re
import uuid
import asyncio
import functools
from typing import Dict, Any, Optional, List, Union, Tuple
from collections import defaultdict
import csv
import chardet # For robust CSV encoding detection

# --- Cloud/External Libs ---
# Assume google-cloud-storage, google-cloud-bigquery are available via Cell 1 checks
try:
    from google.cloud import storage, bigquery, exceptions as gcp_exceptions
    GCP_SDK_AVAILABLE = True # Re-check based on Cell 1
except ImportError:
    GCP_SDK_AVAILABLE = False

try:
    import aio_gcsfs # Recommended: `pip install aio-gcsfs gcsfs`
    import gcsfs
    AIO_GCS_AVAILABLE = True
except ImportError:
    AIO_GCS_AVAILABLE = False
    logging.warning("aio-gcsfs/gcsfs not installed. GCS reads will use less efficient sync methods in threads.")

# --- Assume Real dependencies from other cells are loaded/injected ---
# These would typically be clients/proxies passed during service initialization
try:
    # Use CONFIG_OBJ loaded in Cell 1
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ: raise NameError("CONFIG_OBJ not found or is None")
    if not isinstance(CONFIG_OBJ, EnhancedConfig): raise NameError("CONFIG_OBJ is not an EnhancedConfig instance")

    # Use foundation_model_client proxy (representing deployed Cell 18 service)
    # This would be injected into the service running this logic
    if 'foundation_model_client' not in globals() or not foundation_model_client:
        # If running standalone, create a mock for testing
        class MockFMClientTool:
            async def extract_kg_data_from_content(self, *args, **kwargs): await asyncio.sleep(0.05); return {"status": "success", "payload": {"entities": [], "relationships": []}}
            async def summarize(self, *args, **kwargs): await asyncio.sleep(0.01); return {"status": "success", "payload": {"summary": "Mock summary."}}
            async def extract_entities(self, *args, **kwargs): await asyncio.sleep(0.01); return {"status": "success", "payload": {"entities": [{"entity": "mock"}]}}
        foundation_model_client = MockFMClientTool()
        logging.warning("Using Mock FoundationModelClient for Cell 2.")

    _config_obj = CONFIG_OBJ
    _fm_client_tool = foundation_model_client # Use real or mock client proxy
    _real_dependencies = True
    logger.debug("Using real/mock CONFIG_OBJ and foundation_model_client in Cell 2 (Reworked).")
except NameError as e:
    # This block should ideally not be hit if Cell 1 ran successfully
    logging.critical(f"CRITICAL DEPENDENCY ERROR ({e}). Cannot proceed with Cell 2 logic.", exc_info=True)
    # Define minimal mocks to prevent immediate crashes, but functionality is broken
    _config_obj = None; _fm_client_tool = None; _real_dependencies = False

logger = logging.getLogger('MIZ-OKI.DataExtractionTool')

# --- Data Validation Function (Improved Error Handling) ---
def validate_dataframe_schema(df: pd.DataFrame, schema: List[bigquery.SchemaField], file_path: str) -> Tuple[bool, pd.DataFrame, List[str]]:
    """Validates the DataFrame against the provided BigQuery schema. Handles type coercion."""
    if df.empty: return True, df, [] # Skip validation for empty dataframes
    errors = []; df_validated = df.copy()
    type_mapping = {
        "STRING": "string", "BYTES": "object", "INTEGER": "Int64", "INT64": "Int64",
        "FLOAT": "float64", "FLOAT64": "float64", "NUMERIC": "float64", "BIGNUMERIC": "float64",
        "BOOLEAN": "boolean", "BOOL": "boolean", "TIMESTAMP": "datetime64[ns, UTC]",
        "DATE": "datetime64[ns]", "TIME": "object", "DATETIME": "datetime64[ns]",
        "GEOGRAPHY": "object", "JSON": "object",
    }
    schema_dict = {col.name: col for col in schema}
    present_schema_cols = {col.name for col in schema if col.name in df_validated.columns}
    missing_required_cols = [col.name for col in schema if col.name not in df_validated.columns and col.mode == 'REQUIRED']
    if missing_required_cols:
        errors.append(f"Missing REQUIRED columns: {', '.join(missing_required_cols)}")
        # Decide whether to fail or add NA columns - failing is safer for required cols
        return False, df, errors

    for col_name in present_schema_cols:
        target_field = schema_dict[col_name]
        target_bq_type = target_field.field_type
        target_pd_type = type_mapping.get(target_bq_type)

        if target_pd_type:
            try:
                original_null_count = df_validated[col_name].isnull().sum()
                current_series = df_validated[col_name]

                if target_pd_type.startswith("datetime"):
                    # More robust datetime parsing
                    converted_series = pd.to_datetime(current_series, errors='coerce', infer_datetime_format=True)
                    if target_pd_type.endswith("UTC]"):
                        # Localize timezone-naive datetimes before converting to UTC
                        converted_series = converted_series.apply(lambda x: x.tz_localize('UTC') if x.tzinfo is None else x)
                        converted_series = converted_series.dt.tz_convert('UTC')
                    df_validated[col_name] = converted_series
                elif target_pd_type == "Int64":
                    # Coerce non-numeric to NaN before converting to Int64
                    converted_series = pd.to_numeric(current_series, errors='coerce')
                    df_validated[col_name] = converted_series.astype(target_pd_type)
                elif target_pd_type == "boolean":
                    # Handle various boolean representations
                    bool_map = {'true': True, 'false': False, '1': True, '0': False, 'yes': True, 'no': False, 't': True, 'f': False, 'y': True, 'n': False, 1: True, 0: False, 1.0: True, 0.0: False}
                    # Convert to string, lower, then map, handling potential NA values correctly
                    df_validated[col_name] = current_series.astype(str).str.lower().map(bool_map).astype(target_pd_type)
                else: # float, string, object
                    df_validated[col_name] = current_series.astype(target_pd_type)

                # Check if coercion introduced more nulls (indicating failure)
                if df_validated[col_name].isnull().sum() > original_null_count:
                    errors.append(f"Type coercion failed for some values in '{col_name}' (Expected BQ Type: {target_bq_type}). Check data format.")
            except Exception as e:
                errors.append(f"Type conversion error in column '{col_name}' (Expected BQ Type: {target_bq_type}): {e}")
        else:
            warnings.append(f"Unsupported BQ type '{target_bq_type}' for validation in column '{col_name}'. Skipping type check.")

    # Check for nulls in required columns AFTER type coercion
    for col_name, field in schema_dict.items():
        if col_name in df_validated.columns and field.mode == 'REQUIRED' and df_validated[col_name].isnull().any():
            errors.append(f"REQUIRED column '{col_name}' contains null/NA values after processing.")

    is_valid = not errors
    if not is_valid:
        logger.error(f"Schema validation errors for {file_path}:\n" + "\n".join(errors))
    else:
        logger.debug(f"Schema validation successful for {file_path}.")
    return is_valid, df_validated, errors

# --- Core Logic Functions (Callable by ADK Agent/Tool - Reworked) ---

async def read_data_source_reformed(source_type: str, source_uri_or_query: str, config: EnhancedConfig, bq_schema: Optional[List[bigquery.SchemaField]] = None) -> Optional[pd.DataFrame]:
    """ Reads data from GCS (CSV, JSONL, Parquet) or BQ asynchronously. Includes validation for CSV. """
    if not config or not config.gcp or not config.gcp.project_id:
        logger.critical("GCP configuration (project_id) is missing. Cannot read data.")
        return None
    if not GCP_SDK_AVAILABLE:
        logger.error("GCP SDKs are unavailable. Cannot read data.")
        return None

    project_id = config.gcp.project_id
    bucket_name = config.gcp.gcs_bucket_name
    start_time = time.monotonic()
    logger.info(f"Reading source. Type='{source_type}', Source='{source_uri_or_query[:100]}...'")
    df = None

    if source_type == 'gcs':
        if not bucket_name:
            logger.error("GCS Bucket name missing in config.")
            return None
        gcs_path = source_uri_or_query if source_uri_or_query.startswith("gs://") else f"gs://{bucket_name}/{source_uri_or_query.lstrip('/')}"
        actual_bucket_name = gcs_path.split('/')[2]
        blob_name = '/'.join(gcs_path.split('/')[3:])
        ext = os.path.splitext(gcs_path.lower())[1]
        logger.info(f"Target GCS path: {gcs_path} (Bucket: {actual_bucket_name}, Blob: {blob_name})")

        try:
            storage_client = storage.Client(project=project_id)
            bucket = storage_client.bucket(actual_bucket_name)
            blob = bucket.blob(blob_name)

            # Check blob existence asynchronously
            loop = asyncio.get_running_loop()
            exists = await loop.run_in_executor(None, blob.exists)
            if not exists:
                raise FileNotFoundError(f"GCS object not found: {gcs_path}")

            # --- Read content bytes asynchronously ---
            content_bytes = None
            read_start_time = time.monotonic()
            try:
                # Use aio-gcsfs if available (preferred)
                if AIO_GCS_AVAILABLE:
                    logger.debug("Using aio-gcsfs for GCS read.")
                    fs = aio_gcsfs.GCSFileSystem(project=project_id)
                    async with fs.open(gcs_path, 'rb') as f:
                        content_bytes = await f.read()
                # Fallback: Use standard client in thread
                else:
                    logger.warning("aio-gcsfs not available. Using sync GCS read in thread (less efficient).")
                    def _sync_read_gcs():
                        return blob.download_as_bytes()
                    content_bytes = await loop.run_in_executor(None, _sync_read_gcs)
                read_duration = (time.monotonic() - read_start_time) * 1000
                logger.info(f"Read {len(content_bytes or b'0')} bytes from GCS in {read_duration:.2f} ms.")
            except Exception as read_err:
                logger.error(f"Failed to read GCS object {gcs_path}: {read_err}", exc_info=True)
                return None
            # --- End Read content bytes ---

            if content_bytes is None: return None

            # --- Parse content based on extension ---
            parse_start_time = time.monotonic()
            if ext == '.csv':
                logger.info(f"Parsing CSV file: {gcs_path}")
                def _parse_csv_sync(): # Sync function for executor
                    try:
                        # Detect encoding robustly
                        detected = chardet.detect(content_bytes[:10000]) # Check first 10k bytes
                        encoding = detected['encoding'] if detected['encoding'] and detected['confidence'] > 0.7 else 'utf-8'
                        logger.info(f"Detected encoding for {gcs_path}: {encoding} (Confidence: {detected['confidence']:.2f})")
                        try:
                            # Use BytesIO for pandas
                            df_csv = pd.read_csv(io.BytesIO(content_bytes), encoding=encoding, on_bad_lines='warn', quoting=csv.QUOTE_MINIMAL, low_memory=False)
                        except UnicodeDecodeError:
                            logger.warning(f"Decode failed with {encoding}, trying utf-8 as fallback.")
                            df_csv = pd.read_csv(io.BytesIO(content_bytes), encoding='utf-8', on_bad_lines='warn', quoting=csv.QUOTE_MINIMAL, low_memory=False)
                        if df_csv.empty:
                            logger.warning(f"CSV {gcs_path} is empty or failed parse.")
                            return None
                        # Perform validation if schema provided
                        if bq_schema:
                            is_valid, df_validated, _ = validate_dataframe_schema(df_csv, bq_schema, gcs_path)
                            if not is_valid:
                                logger.error(f"Schema validation failed for {gcs_path}. Skipping.")
                                return None
                            return df_validated
                        else:
                            return clean_generic_data(df_csv) # Generic cleaning if no schema
                    except Exception as e_inner:
                        logger.error(f"Error parsing CSV {gcs_path} in thread: {e_inner}", exc_info=True)
                        return None
                df = await loop.run_in_executor(None, _parse_csv_sync)

            elif ext in ['.jsonl', '.json']:
                logger.info(f"Parsing JSON(L) file: {gcs_path}")
                try:
                    df = pd.read_json(io.BytesIO(content_bytes), lines=(ext == '.jsonl'), encoding='utf-8')
                    df = clean_generic_data(df)
                except Exception as json_e:
                    logger.error(f"Failed to parse JSON(L) file {gcs_path}: {json_e}", exc_info=True)
                    return None
            elif ext == '.parquet':
                logger.info(f"Parsing Parquet file: {gcs_path}")
                try:
                    df = pd.read_parquet(io.BytesIO(content_bytes))
                    df = clean_generic_data(df)
                except Exception as pq_e:
                    logger.error(f"Failed to parse Parquet file {gcs_path}: {pq_e}", exc_info=True)
                    return None
            else:
                logger.warning(f"Unsupported GCS file extension: {ext}")
                return None

            parse_duration = (time.monotonic() - parse_start_time) * 1000
            if df is not None: logger.info(f"Parsed GCS data ({df.shape=}) in {parse_duration:.2f} ms.")
            # --- End Parse content ---

        except FileNotFoundError:
            logger.error(f"GCS file/object not found: {gcs_path}")
            return None
        except gcp_exceptions.GoogleAPIError as api_e:
             logger.error(f"GCP API error accessing GCS {gcs_path}: {api_e}", exc_info=True)
             return None
        except Exception as e:
            logger.error(f"Error processing GCS source {gcs_path}: {e}", exc_info=True)
            return None

    elif source_type == 'bq':
        logger.info(f"Reading BigQuery: {source_uri_or_query[:150]}...")
        try:
            bq_client = bigquery.Client(project=project_id)
            # Run sync BQ client call in a thread
            def query_bq_sync():
                # Consider adding query parameters for security if query is dynamic
                return bq_client.query(source_uri_or_query).to_dataframe(create_bqstorage_client=True) # Use Storage API for speed
            df = await asyncio.to_thread(query_bq_sync)
            df = clean_generic_data(df) # Generic cleaning
        except gcp_exceptions.GoogleAPIError as api_e:
             logger.error(f"GCP API error querying BigQuery: {api_e}", exc_info=True)
             return None
        except Exception as e:
            logger.error(f"Error reading BQ: {e}", exc_info=True)
            return None
    else:
        logger.error(f"Unsupported data source type: {source_type}")
        return None

    if df is None:
        logger.warning(f"No DataFrame produced for source: {source_uri_or_query}")
        return None

    total_duration = (time.monotonic() - start_time) * 1000
    logger.info(f"Successfully processed source ({df.shape=}) in {total_duration:.2f} ms.")
    return df

# --- Transformation Helpers (Remain sync, same as before, but ensure robustness) ---
def _add_entity(entities_list: List[Dict], entity_dict: Dict) -> bool:
    """Safely adds a validated entity dictionary to the list."""
    if not isinstance(entity_dict, dict): logger.warning(f"Skipping non-dict entity: {type(entity_dict)}"); return False
    if not entity_dict.get('type') or not entity_dict.get('_resolution_hints'):
        logger.warning(f"Skipping entity: missing 'type' or '_resolution_hints': {str(entity_dict)[:150]}...")
        return False
    cleaned_entity = {}
    for k, v in entity_dict.items():
        if pd.isna(v): continue # Skip NaN/NA values
        # Convert numpy types to standard Python types
        if isinstance(v, (np.integer, np.int64)): v = int(v)
        elif isinstance(v, (np.floating, np.float64)):
            if np.isnan(v) or np.isinf(v): continue # Skip NaN/inf floats
            v = float(v)
        elif isinstance(v, (datetime.datetime, datetime.date, pd.Timestamp)):
            try: v = pd.to_datetime(v).isoformat() # Ensure ISO format
            except Exception: v = str(v) # Fallback to string if conversion fails
        elif isinstance(v, (list, dict)):
             try: json.dumps(v, default=str); # Test serializability
             except TypeError: v = str(v); logger.debug(f"Converted unserializable {type(v)} to string for key '{k}'.")
        elif not isinstance(v, (str, int, float, bool)):
             v = str(v) # Convert other types to string as fallback

        # Final check for NaN floats that might have slipped through
        if isinstance(v, float) and np.isnan(v): continue

        cleaned_entity[k] = v
    entities_list.append(cleaned_entity)
    return True

def _add_relationship(relationships_list: List[Dict], rel_dict: Dict) -> bool:
    """Safely adds a validated relationship dictionary to the list."""
    if not isinstance(rel_dict, dict): logger.warning(f"Skipping non-dict relationship: {type(rel_dict)}"); return False
    if not rel_dict.get('type') or not rel_dict.get('source_hints') or not rel_dict.get('target_hints'):
        logger.warning(f"Skipping relationship: missing 'type', 'source_hints', or 'target_hints': {str(rel_dict)[:150]}...")
        return False
    cleaned_rel = {}
    for k, v in rel_dict.items():
        if pd.isna(v): continue
        if isinstance(v, (np.integer, np.int64)): v = int(v)
        elif isinstance(v, (np.floating, np.float64)):
            if np.isnan(v) or np.isinf(v): continue
            v = float(v)
        elif isinstance(v, (datetime.datetime, datetime.date, pd.Timestamp)):
            try: v = pd.to_datetime(v).isoformat()
            except Exception: v = str(v)
        elif isinstance(v, (list, dict)):
             try: json.dumps(v, default=str);
             except TypeError: v = str(v); logger.debug(f"Converted unserializable {type(v)} to string for key '{k}'.")
        elif not isinstance(v, (str, int, float, bool)):
             v = str(v)
        if isinstance(v, float) and np.isnan(v): continue
        cleaned_rel[k] = v
    relationships_list.append(cleaned_rel)
    return True

def _safe_get(row: Union[pd.Series, Dict], key: Optional[str], default: Any = None) -> Any:
    """Safely get value from dict or Series, handling NA/None."""
    if key is None: return default
    val = default
    try:
        if isinstance(row, dict): val = row.get(key, default)
        elif isinstance(row, pd.Series): val = row.get(key, default)
        elif hasattr(row, key): val = getattr(row, key, default) # For objects
        else: return default
        # Return default if value is NaN, None, or NaT
        return default if pd.isna(val) else val
    except (TypeError, KeyError, AttributeError):
        return default

def _parse_date(date_str: Any, default: Optional[str] = None) -> Optional[str]:
    """Parse date string robustly into ISO format."""
    if date_str is None or pd.isna(date_str) or str(date_str).strip() == '': return default
    try:
        # Attempt parsing with pandas, coercing errors
        dt_obj = pd.to_datetime(date_str, errors='coerce', infer_datetime_format=True)
        # Return ISO format if valid, otherwise return original string (or default)
        return dt_obj.isoformat() if pd.notna(dt_obj) else str(date_str)
    except Exception:
        # Fallback to string representation if parsing fails unexpectedly
        return str(date_str)

def map_to_funnel_stage(event: str, funnel_stages_config: Dict[str, List[str]]) -> Optional[str]:
    """Maps an event name to a funnel stage based on config."""
    if not event or not isinstance(event, str) or not funnel_stages_config: return None
    event_lower = event.lower().strip()
    for stage, events in funnel_stages_config.items():
        if isinstance(events, list) and event_lower in [e.lower().strip() for e in events]:
            return stage
    return None # Return None if no match found

# --- Platform Specific Transformations (Stubs with Guidance - Reworked Signatures) ---

def _transform_facebook_ads_reformed(df: pd.DataFrame, config: EnhancedConfig) -> Tuple[List[Dict], List[Dict]]:
    entities = []; relationships = []; errors = []
    logger.info(f"Running Facebook Ads transformation on {len(df)} rows...")
    required_cols = ['campaign_id', 'ad_id', 'date', 'spend', 'impressions', 'clicks'] # Example required
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols: raise ValueError(f"Missing required FB Ads columns: {missing_cols}")

    # --- Placeholder: IMPLEMENTATION NEEDED ---
    # Iterate through df rows (df.itertuples() is faster)
    # Extract IDs, create hints, create entity/relationship dicts using _safe_get, _parse_date
    # Use _add_entity and _add_relationship
    # Log errors encountered during row processing
    # Example (Partial):
    # for row in df.itertuples(index=False):
    #     try:
    #         campaign_id = str(_safe_get(row, 'campaign_id')); ad_id = str(_safe_get(row, 'ad_id'))
    #         camp_hints = {"platform": "facebook", "type": "Campaign", "original_id": campaign_id}
    #         ad_hints = {"platform": "facebook", "type": "Ad", "original_id": ad_id}
    #         # ... create entity/rel dicts ...
    #         _add_entity(entities, campaign_data)
    #         _add_entity(entities, ad_data)
    #         _add_relationship(relationships, rel_data)
    #     except Exception as e: errors.append(...)
    # --- End Placeholder ---

    logger.info(f"FB Ads transform yielded {len(entities)} entities, {len(relationships)} relationships, {len(errors)} row errors.")
    if errors: logger.warning(f"FB Ads transform errors: {errors[:5]}") # Log first few errors
    return entities, relationships

def _transform_shopify_reformed(df: pd.DataFrame, config: EnhancedConfig) -> Tuple[List[Dict], List[Dict]]:
    entities = []; relationships = []; errors = []
    logger.info(f"Running Shopify transformation on {len(df)} rows...")
    required_cols = ['order_id', 'customer_id', 'created_at', 'total_price', 'email'] # Example
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols: raise ValueError(f"Missing required Shopify columns: {missing_cols}")
    # Access funnel stages config safely
    funnel_stages_config = config.get("business_impact.funnel_stages", {}) if config else {}

    # --- Placeholder: IMPLEMENTATION NEEDED ---
    # Iterate, extract, create hints, create dicts, use helpers _add_entity, _add_relationship
    # Use map_to_funnel_stage for events
    # --- End Placeholder ---

    logger.info(f"Shopify transform yielded {len(entities)} entities, {len(relationships)} relationships, {len(errors)} row errors.")
    if errors: logger.warning(f"Shopify transform errors: {errors[:5]}")
    return entities, relationships

def _transform_ga4_reformed(df: pd.DataFrame, config: EnhancedConfig) -> Tuple[List[Dict], List[Dict]]:
    entities = []; relationships = []; errors = []
    logger.info(f"Running GA4 transformation on {len(df)} rows...")
    required_cols = ['event_name', 'user_pseudo_id', 'event_timestamp'] # Example
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols: raise ValueError(f"Missing required GA4 columns: {missing_cols}")
    funnel_stages_config = config.get("business_impact.funnel_stages", {}) if config else {}

    # --- Placeholder: IMPLEMENTATION NEEDED ---
    # Iterate, extract, create hints, create dicts, use helpers _add_entity, _add_relationship
    # Use map_to_funnel_stage for events
    # --- End Placeholder ---

    logger.info(f"GA4 transform yielded {len(entities)} entities, {len(relationships)} relationships, {len(errors)} row errors.")
    if errors: logger.warning(f"GA4 transform errors: {errors[:5]}")
    return entities, relationships

# --- TODO: Implement other _transform_* functions similarly ---
def _transform_google_ads_reformed(df: pd.DataFrame, config: EnhancedConfig) -> Tuple[List[Dict], List[Dict]]:
    logger.warning("Google Ads transform not implemented."); return [], []
def _transform_klaviyo_reformed(df: pd.DataFrame, config: EnhancedConfig) -> Tuple[List[Dict], List[Dict]]:
    logger.warning("Klaviyo transform not implemented."); return [], []
def _transform_support_logs_reformed(df: pd.DataFrame, config: EnhancedConfig) -> Tuple[List[Dict], List[Dict]]:
    logger.warning("Support Logs transform (likely semantic) not implemented."); return [], []

def _transform_generic_reformed(df: pd.DataFrame, data_type: str, config: EnhancedConfig) -> Tuple[List[Dict], List[Dict]]:
    """Generic transformation for unknown structured data types."""
    entities = []; relationships = []; errors = []; id_col = None
    if df.empty: return [], []
    common_id_names = ['id', 'uuid', 'key', 'identifier', 'record_id', 'objectid', 'pk']
    df_cols_lower = {c.lower(): c for c in df.columns}
    for potential_id in common_id_names:
        if potential_id in df_cols_lower: id_col = df_cols_lower[potential_id]; break
    if not id_col and len(df.columns) > 0: id_col = df.columns[0] # Fallback to first column

    if id_col:
         logger.info(f"Using column '{id_col}' as primary identifier for generic transform of '{data_type}'.")
         for index, row in df.iterrows():
              row_info = f"Row Index: {index}"
              try:
                  original_id_val = _safe_get(row, id_col)
                  original_id = str(original_id_val) if original_id_val is not None and str(original_id_val).strip() != '' else f"generic_{data_type}_{index}"
                  hints = {"data_type": data_type, "type": "GenericData", "original_id": original_id}
                  # Include all non-null properties
                  entity_data = {"type": "GenericData", "_resolution_hints": hints, "original_id": original_id}
                  for col, val in row.items():
                      if pd.notna(val): entity_data[col] = val # Let _add_entity handle type conversion
                  _add_entity(entities, entity_data)
              except Exception as e: errors.append({"row_info": row_info, "error": str(e)})
    else: logger.error(f"Cannot perform generic transform for '{data_type}': No suitable ID column found.")
    logger.info(f"Generic Transform for '{data_type}' yielded {len(entities)} entities, {len(errors)} row errors.")
    if errors: logger.warning(f"Generic transform errors: {errors[:5]}")
    return entities, relationships

# --- Main Transformation Function (Reworked for MIZ OKI I/O) ---
async def extract_and_transform_for_kg_reformed(input_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Reads, validates, transforms data into KG-ready format asynchronously.
    Expects input_data to be a dict representing the MIZ OKI payload from a workflow step.
    Returns a dict representing the MIZ OKI response payload.
    """
    # --- MIZ OKI Payload Parsing ---
    miz_oki_version = input_data.get("miz_oki_version", "unknown")
    request_id = input_data.get("request_id", f"req_{uuid.uuid4().hex[:8]}")
    trace_id = input_data.get("trace_id", f"trace_{uuid.uuid4().hex[:8]}")
    workflow_execution_id = input_data.get("workflow_execution_id")
    step_id = input_data.get("step_id")
    source_component = input_data.get("source_component")
    payload = input_data.get("payload", {})
    source_type = payload.get("source_type") # e.g., 'gcs', 'bq'
    source_uri_or_query = payload.get("source_uri_or_query") # e.g., 'gs://bucket/file.csv', 'SELECT * ...'
    data_type_hint = payload.get("data_type_hint") # e.g., 'facebook_ads', 'shopify', 'ga4'
    bq_schema_list = payload.get("bq_schema") # Optional: List of dicts for schema validation

    task_id = f"extract_{uuid.uuid4().hex[:8]}"
    start_time_task = time.monotonic()
    logger.info(f"Starting KG extraction/transformation async (TaskID: {task_id}, TraceID: {trace_id}, WorkflowExec: {workflow_execution_id}, Step: {step_id}): Type='{source_type}', Source='{source_uri_or_query[:100]}...'")

    # --- Prepare MIZ OKI Response Structure ---
    response = {
        "miz_oki_version": _config_obj.miz_oki_schema_version if _config_obj else "unknown",
        "request_id": request_id, "trace_id": trace_id, "workflow_execution_id": workflow_execution_id, "step_id": step_id,
        "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
        "source_component": "DataExtractionTool", # This component's name
        "target_component": source_component, # Responding to the caller
        "status": "unknown", # Will be updated: success, partial_success, failed_validation, skipped_no_data, failed_transform, critical_failure
        "payload": {"kg_entities": [], "kg_relationships": [], "log": {}},
        "error_details": None,
        "metadata": {}
    }
    run_log = response["payload"]["log"] # Reference for easier logging
    run_log.update({"task_id": task_id, "source_type": source_type, "source": source_uri_or_query, "status": "started", "start_time_iso": response["timestamp"], "steps": {}})
    errors = [] # Local error list for non-critical issues

    # --- Dependency Check ---
    if not _config_obj or not _fm_client_tool:
        error_msg = "Critical configuration or FM Client Tool dependency missing."
        logger.critical(f"Task {task_id}: {error_msg}")
        response["status"] = "critical_failure"; response["error_details"] = [{"code": "DEPENDENCY_MISSING", "message": error_msg}]
        run_log["status"] = "critical_failure"; run_log["error"] = error_msg
        run_log["total_duration_ms"] = (time.monotonic() - start_time_task) * 1000
        response["metadata"]["processing_duration_ms"] = run_log["total_duration_ms"]
        return response
    # --- End Dependency Check ---

    try:
        if not source_type or not source_uri_or_query:
            raise ValueError("Missing 'source_type' or 'source_uri_or_query' in input payload.")

        # Convert BQ schema dict list to SchemaField objects if provided
        bq_schema_obj: Optional[List[bigquery.SchemaField]] = None
        if bq_schema_list and isinstance(bq_schema_list, list) and GCP_SDK_AVAILABLE and hasattr(bigquery, 'SchemaField'):
            try:
                bq_schema_obj = [bigquery.SchemaField.from_api_repr(field_dict) for field_dict in bq_schema_list]
                logger.debug(f"Task {task_id}: Successfully parsed BQ schema.")
            except Exception as schema_e:
                logger.warning(f"Task {task_id}: Failed to parse BQ schema: {schema_e}. Proceeding without schema validation for CSV.")
                errors.append({"code": "SCHEMA_PARSE_ERROR", "message": f"Failed to parse BQ schema: {schema_e}"})

        # 1. Read Data Async
        step_start = time.monotonic()
        df = await read_data_source_reformed(source_type, source_uri_or_query, _config_obj, bq_schema_obj)
        read_duration = (time.monotonic() - step_start) * 1000
        run_log["steps"]["read_data"] = {"duration_ms": read_duration, "source_type": source_type}

        if df is None: # Handles read errors OR validation failures during read
            response["status"] = "failed_validation" if bq_schema_obj and source_type == 'gcs' and source_uri_or_query.endswith('.csv') else "skipped_no_data"
            run_log["status"] = response["status"]
            run_log["steps"]["read_data"]["status"] = "no_data_or_validation_failed"
            error_msg = "No data read from source or schema validation failed during read."
            logger.warning(f"Task {task_id}: {error_msg}")
            errors.append({"code": "NO_DATA_OR_VALIDATION", "message": error_msg})
            # No critical failure here, just no data to process
        elif df.empty:
            response["status"] = "skipped_no_data"
            run_log["status"] = response["status"]
            run_log["steps"]["read_data"]["status"] = "empty_dataframe"
            logger.info(f"Task {task_id}: Source contained no data rows.")
        else:
            run_log["rows_read"] = len(df)
            run_log["steps"]["read_data"]["rows_read"] = len(df)
            run_log["steps"]["read_data"]["status"] = "success"
            logger.info(f"Task {task_id}: Read {len(df)} rows in {read_duration:.2f} ms.")

            # 2. Determine Data Type
            data_type = data_type_hint
            if not data_type: # Auto-detect based on URI/query content
                src_lower = source_uri_or_query.lower()
                if 'facebook' in src_lower or 'meta' in src_lower: data_type = 'facebook_ads'
                elif 'shopify' in src_lower: data_type = 'shopify'
                elif 'googleads' in src_lower or 'adwords' in src_lower: data_type = 'google_ads'
                elif 'analytics' in src_lower or 'ga4' in src_lower: data_type = 'ga4'
                elif 'klaviyo' in src_lower: data_type = 'klaviyo'
                elif 'support' in src_lower or 'ticket' in src_lower or 'log' in src_lower: data_type = 'support_logs' # Could be semantic
                # Add more specific detections
                else: data_type = "generic"; logger.warning(f"Task {task_id}: Could not auto-detect data type from '{source_uri_or_query[:50]}...'. Defaulting to 'generic'.")
            run_log["data_type"] = data_type
            run_log["steps"]["validation"] = {"data_type": data_type, "status": "performed_during_read" if source_type=='gcs' and source_uri_or_query.endswith('.csv') and bq_schema_obj else "skipped_no_schema_or_not_csv"}
            df_validated = df # Already validated or cleaned during read

            # 3. Transform Data
            step_start = time.monotonic()
            entities, relationships = [], []
            # Define types likely needing semantic processing
            unstructured_types = {'text_log', 'support_transcript', 'external_news', 'returns_data', 'web_page_content', 'support_logs'}
            use_semantic = (data_type in unstructured_types)
            run_log["steps"]["transform"] = {"type": "semantic" if use_semantic else "rule_based", "status": "pending"}

            if use_semantic:
                logger.info(f"Task {task_id}: Applying SEMANTIC processing via FMClientTool API for: {data_type}")
                model_alias = _config_obj.foundation_models.defaults.llama4_maverick # Or choose based on data_type
                # --- Semantic processing logic (using _fm_client_tool proxy) ---
                text_col = 'text_content' if 'text_content' in df_validated.columns else 'content' if 'content' in df_validated.columns else None # Find text column
                if text_col and not df_validated.empty:
                    batch_size = 10 # Adjust batch size based on API limits and content size
                    extraction_tasks = []
                    # Ensure unique ID for context passing
                    if 'miz_internal_id' not in df_validated.columns: df_validated['miz_internal_id'] = [f"row_{i}" for i in range(len(df_validated))]

                    for i in range(0, len(df_validated), batch_size):
                        batch_df = df_validated.iloc[i:i+batch_size]
                        batch_content = batch_df[text_col].astype(str).tolist()
                        batch_ids = batch_df['miz_internal_id'].astype(str).tolist()
                        batch_tasks_inner = []
                        for idx, content_item in enumerate(batch_content):
                            if not content_item or pd.isna(content_item) or not str(content_item).strip(): continue
                            original_id = batch_ids[idx]
                            # Prepare MIZ OKI payload for FM Client Tool API call
                            fm_request_payload = {
                                "payload": {
                                    "content": content_item,
                                    "data_type": data_type,
                                    "model_alias": model_alias,
                                    "context": {"original_id": original_id, "source_uri": source_uri_or_query}
                                },
                                "trace_id": trace_id, "request_id": f"{request_id}_fm_{original_id}"
                            }
                            # Call FM Client Tool API (via proxy)
                            task = _fm_client_tool.extract_kg_data_from_content(input_data=fm_request_payload)
                            batch_tasks_inner.append(task)
                        if batch_tasks_inner: extraction_tasks.append(asyncio.gather(*batch_tasks_inner, return_exceptions=True))

                    if extraction_tasks:
                        all_batch_results = await asyncio.gather(*extraction_tasks)
                        for batch_results in all_batch_results:
                            for fm_response in batch_results: # fm_response is the MIZ OKI response dict from the FM tool
                                if isinstance(fm_response, Exception):
                                    err_detail = f"Semantic extraction API call failed: {fm_response}"; logger.error(err_detail); errors.append({"code": "SEMANTIC_API_ERROR", "message": err_detail})
                                elif isinstance(fm_response, dict) and fm_response.get("status") == "success":
                                     fm_payload = fm_response.get("payload", {})
                                     # Use helpers to add entities/rels, ensuring validation
                                     for e_dict in fm_payload.get("entities", []): _add_entity(entities, e_dict)
                                     for r_dict in fm_payload.get("relationships", []): _add_relationship(relationships, r_dict)
                                elif isinstance(fm_response, dict): # Handle errors reported by FM tool
                                     err_detail = f"Semantic extraction failed in FM Tool: {fm_response.get('error_details')}"
                                     logger.error(err_detail); errors.append({"code": "SEMANTIC_TOOL_ERROR", "message": err_detail})
                                else: logger.warning(f"Task {task_id}: Invalid format received from semantic extraction API: {type(fm_response)}.")
                else: logger.warning(f"Task {task_id}: Suitable text column ('{text_col}') not found or DataFrame empty for semantic extraction.")
                # --- End Semantic processing ---
            else: # Rule-Based Transformation
                 logger.info(f"Task {task_id}: Applying RULE-BASED transformation for: {data_type}")
                 transform_func_map = {
                     'facebook_ads': _transform_facebook_ads_reformed,
                     'shopify': _transform_shopify_reformed,
                     'ga4': _transform_ga4_reformed,
                     'google_ads': _transform_google_ads_reformed,
                     'klaviyo': _transform_klaviyo_reformed,
                     'support_logs': _transform_support_logs_reformed, # Could also be semantic
                     'generic': _transform_generic_reformed,
                 }
                 # Get the appropriate function, default to generic
                 transform_func = transform_func_map.get(data_type, _transform_generic_reformed)
                 try:
                      loop = asyncio.get_running_loop()
                      # Pass config object for access to parameters like funnel stages
                      if transform_func == _transform_generic_reformed:
                          # Generic function needs data_type explicitly
                          func_args = (df_validated.copy(), data_type, _config_obj)
                      else:
                          func_args = (df_validated.copy(), _config_obj)
                      # Run the synchronous transform function in a thread
                      entities_t, relationships_t = await loop.run_in_executor(None, functools.partial(transform_func, *func_args))
                      entities.extend(entities_t); relationships.extend(relationships_t)
                 except Exception as transform_e:
                      logger.error(f"Task {task_id}: Critical failure during rule-based transformation '{data_type}': {transform_e}", exc_info=True)
                      errors.append({"code": "TRANSFORM_CRITICAL", "message": f"Critical transform error: {transform_e}"})
                      response["status"] = "failed_transform" # Mark as failed if transform crashes

            run_log["steps"]["transform"]["duration_ms"] = (time.monotonic() - step_start) * 1000
            run_log["steps"]["transform"]["status"] = "success" if response["status"] != "failed_transform" else "failed"
            run_log["entities_extracted"] = len(entities)
            run_log["relationships_extracted"] = len(relationships)

            # Final processing and status determination only if read was successful
            response["payload"]["kg_entities"] = entities
            response["payload"]["kg_relationships"] = relationships
            entity_count = len(entities); relationship_count = len(relationships); error_count = len(errors)

            if response["status"] == "unknown": # Check if status was set during transformation errors
                if error_count == 0:
                    response["status"] = "success" if entity_count > 0 or relationship_count > 0 else "success_no_results"
                else:
                    # Partial success if some entities/rels were generated despite row/API errors
                    response["status"] = "partial_success" if entity_count > 0 or relationship_count > 0 else "failed_transform"

        # Update final status in log
        run_log["status"] = response["status"]
        run_log["error_count"] = len(errors)
        run_log["total_duration_ms"] = (time.monotonic() - start_time_task) * 1000
        response["metadata"]["processing_duration_ms"] = run_log["total_duration_ms"]
        if errors: response["error_details"] = errors

        logger.info(f"Task {task_id} FINISHED. Status: {response['status']}. Entities: {run_log.get('entities_extracted', 0)}, Rels: {run_log.get('relationships_extracted', 0)}, Errors: {run_log.get('error_count', 0)}. Total Time: {run_log['total_duration_ms']:.2f} ms")
        return response

    except Exception as outer_e:
        # Catch any unexpected critical errors in the main flow
        logger.critical(f"CRITICAL failure in ETL Task (TaskID: {task_id}, TraceID: {trace_id}): {outer_e}", exc_info=True)
        response["status"] = "critical_failure"
        errors.append({"code": "CRITICAL", "message": str(outer_e)})
        response["error_details"] = errors
        run_log["status"] = "critical_failure"; run_log["error"] = str(outer_e)
        run_log["total_duration_ms"] = (time.monotonic() - start_time_task) * 1000
        response["metadata"]["processing_duration_ms"] = run_log["total_duration_ms"]
        return response

# --- Generic Data Cleaning Function (Improved) ---
def clean_generic_data(df: pd.DataFrame) -> pd.DataFrame:
    """Applies generic cleaning to DataFrames: handles common nulls, attempts type inference."""
    if df.empty: return df
    df_cleaned = df.copy()
    logger.debug(f"Starting generic cleaning for DataFrame with shape {df.shape}...")
    try:
        # Define common null representations
        null_values = ['null', 'NULL', '', '#N/A', 'N/A', 'NaN', 'nan', 'None', '<NA>', 'undefined', 'missing']
        df_cleaned = df_cleaned.replace(null_values, pd.NA) # Use pandas NA consistently

        for col in df_cleaned.columns:
            original_dtype = df_cleaned[col].dtype
            # Skip if already a specific desired type (like datetime from BQ)
            if pd.api.types.is_datetime64_any_dtype(original_dtype) or pd.api.types.is_numeric_dtype(original_dtype) or pd.api.types.is_bool_dtype(original_dtype):
                continue

            # Attempt numeric conversion if object type
            if pd.api.types.is_object_dtype(original_dtype) or pd.api.types.is_string_dtype(original_dtype):
                try:
                    # Try converting to numeric, coercing errors to NA
                    converted_numeric = pd.to_numeric(df_cleaned[col], errors='coerce')
                    # If mostly numeric (e.g., >80% non-NA after conversion), keep it numeric
                    if converted_numeric.notna().sum() / len(df_cleaned[col]) > 0.8:
                        # Use Int64 if possible (no decimals and fits in int64)
                        if converted_numeric.dropna().apply(lambda x: x == int(x) if pd.notna(x) else True).all():
                             df_cleaned[col] = converted_numeric.astype(pd.Int64Dtype())
                        else:
                             df_cleaned[col] = converted_numeric.astype(pd.Float64Dtype())
                        logger.debug(f"Column '{col}': Inferred as numeric ({df_cleaned[col].dtype}).")
                        continue # Move to next column
                except (ValueError, TypeError):
                    pass # Ignore errors if conversion fails, proceed to other types

            # Attempt datetime conversion if object type and not successfully converted to numeric
            if pd.api.types.is_object_dtype(df_cleaned[col].dtype) or pd.api.types.is_string_dtype(df_cleaned[col].dtype):
                try:
                    converted_datetime = pd.to_datetime(df_cleaned[col], errors='coerce', infer_datetime_format=True)
                    # If mostly datetime, keep it
                    if converted_datetime.notna().sum() / len(df_cleaned[col]) > 0.8:
                        df_cleaned[col] = converted_datetime
                        logger.debug(f"Column '{col}': Inferred as datetime.")
                        continue
                except (ValueError, TypeError):
                     pass # Ignore errors

            # Attempt boolean conversion if object type and not converted yet
            if pd.api.types.is_object_dtype(df_cleaned[col].dtype) or pd.api.types.is_string_dtype(df_cleaned[col].dtype):
                 unique_vals = df_cleaned[col].dropna().unique()
                 if len(unique_vals) <= 5: # Limit boolean check to few unique values
                      lowered_unique = {str(v).lower() for v in unique_vals}
                      bool_representations = {'true', 'false', '1', '0', 'yes', 'no', 't', 'f', 'y', 'n', '1.0', '0.0'}
                      if lowered_unique.issubset(bool_representations):
                           bool_map = {'true': True, 'false': False, '1': True, '0': False, 'yes': True, 'no': False, 't': True, 'f': False, 'y': True, 'n': False, '1.0': True, '0.0': False}
                           df_cleaned[col] = df_cleaned[col].astype(str).str.lower().map(bool_map).astype(pd.BooleanDtype())
                           logger.debug(f"Column '{col}': Inferred as boolean.")
                           continue

            # If still object/string, ensure it's nullable string type
            if pd.api.types.is_object_dtype(df_cleaned[col].dtype) or pd.api.types.is_string_dtype(df_cleaned[col].dtype):
                 df_cleaned[col] = df_cleaned[col].astype(pd.StringDtype())
                 logger.debug(f"Column '{col}': Kept as string.")

    except Exception as e:
        logger.error(f"Error during generic cleaning: {e}", exc_info=True)
        return df # Return original df on error
    logger.debug("Generic cleaning finished.")
    return df_cleaned


# --- Example Invocation (Conceptual - within a deployed service/function) ---
# async def handle_workflow_request(request_json: Dict):
#     # 1. Parse request_json (assume it's the MIZ OKI input_data dict)
#     # 2. Call the main processing function
#     response_dict = await extract_and_transform_for_kg_reformed(request_json)
#     # 3. Return response_dict (e.g., as JSON response)
#     return response_dict

print("\n--- MIZ 3.0 Data Extraction Logic (Cell 2 - Reworked) ---")
print("Refactored as Tool/Service logic using MIZ OKI I/O structure.")
print("Uses async I/O and calls FM Client Tool API proxy.")
print("Requires implementation of specific _transform_* functions.")
print("-------------------------------------------------------------")

CRITICAL:root:CRITICAL DEPENDENCY ERROR (CONFIG_OBJ not found or is None). Cannot proceed with Cell 2 logic.
Traceback (most recent call last):
  File "<ipython-input-2-6d37864f042f>", line 43, in <cell line: 41>
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ: raise NameError("CONFIG_OBJ not found or is None")
NameError: CONFIG_OBJ not found or is None


NameError: name 'EnhancedConfig' is not defined

In [4]:
4. Updated Cells (Continued)

Cell 3: Knowledge Graph Layer Implementation (Reworked)

Original Purpose: Synchronous Neo4j adapter, no Vector DB, basic XAI storage, no MIZ OKI API layer.
Key Changes: Neo4jAsyncAdapter uses async driver. Includes routing logic for Vector DB (Vertex AI Vector Search via client proxy). Implements structured XAI storage (save/retrieve_decision_record). Adds DataPseudonymizer. Wraps adapter logic in a conceptual KnowledgeGraphToolService with FastAPI-style endpoint methods (*_endpoint) handling MIZ OKI request/response payloads. All DB/API operations are now async. Added robust error handling and dependency checks.
Reworked Code:
# Cell 3: Knowledge Graph Layer Implementation (Reworked)
# Status: Neo4jAsyncAdapter includes Vector DB routing & structured XAI storage.
#         Conceptual FastAPI service layer added, handling MIZ OKI payloads & pseudonymization.
#         Uses async operations for DB and external Vector DB calls.

import os
import numpy as np
import pandas as pd
import datetime
import json
import logging
import time
import uuid
import hashlib
from collections import defaultdict, deque
from contextlib import contextmanager, asynccontextmanager
from typing import Dict, Any, Optional, List, Union, Tuple, Set, Type, Protocol, Callable
from abc import ABC, abstractmethod
import asyncio

# --- Framework for API Service (Conceptual) ---
# These imports are needed if deploying as a FastAPI service
try:
    from fastapi import FastAPI, HTTPException, Body, Depends, Request, Response, status
    from pydantic import BaseModel, Field, validator
    FASTAPI_AVAILABLE = True
except ImportError:
    FASTAPI_AVAILABLE = False
    # Dummy classes if FastAPI not installed (won't run, just for structure)
    def Depends(dep): return None; class FastAPI: pass; class HTTPException(Exception): pass; def Body(...): return ...; class BaseModel: pass; def Field(...): return ...; class Request: pass; class Response: pass; class status: pass
    logging.warning("FastAPI or Pydantic not installed. API service layer cannot be fully defined.")

# --- Neo4j Integration ---
try:
    # Use the async driver
    from neo4j import AsyncGraphDatabase, basic_auth, exceptions as neo4j_exceptions, AsyncSession, AsyncTransaction, AsyncDriver
    NEO4J_AVAILABLE = True
except ImportError:
    NEO4J_AVAILABLE = False
    # Dummy classes if Neo4j async driver not installed
    class AsyncDriver: pass; class AsyncSession: pass; class AsyncTransaction: pass; class AsyncGraphDatabase: pass
    if 'neo4j_exceptions' not in locals(): class neo4j_exceptions: class ClientError(Exception): pass; class TransientError(Exception): pass; class AuthError(Exception): pass; class ServiceUnavailable(Exception): pass
    logging.warning("Neo4j async driver not found. Neo4j functionality unavailable.")

# --- Vector DB Integration ---
VECTOR_DB_AVAILABLE = False
try:
    # Using Vertex AI Vector Search client library
    from google.cloud import aiplatform
    from google.cloud.aiplatform.matching_engine import MatchingEngineIndexEndpoint, Namespace
    from google.cloud.aiplatform_v1.types import FindNeighborsRequest, FindNeighborsResponse # For type hints
    # Ensure Vertex AI SDK is initialized (should happen in Cell 1)
    if hasattr(aiplatform, 'initializer') and getattr(aiplatform.initializer.global_config, 'project', None):
        VECTOR_DB_AVAILABLE = True
        logging.info("Vertex AI Matching Engine SDK found and Vertex AI initialized.")
    else:
        logging.warning("Vertex AI SDK not initialized. Vertex Vector Search unavailable.")
except ImportError:
    logging.warning("google-cloud-aiplatform not found or version mismatch. Vertex Vector Search unavailable.")
    # Dummy classes for type hinting if SDK missing
    class MatchingEngineIndexEndpoint: pass; class Namespace: pass; class FindNeighborsRequest: pass; class FindNeighborsResponse: pass

# --- Assume Real Tool/Client Dependencies ---
try:
    # Use CONFIG_OBJ loaded in Cell 1
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ: raise NameError("CONFIG_OBJ not found or is None")
    if not isinstance(CONFIG_OBJ, EnhancedConfig): raise NameError("CONFIG_OBJ is not an EnhancedConfig instance")
    _config_obj = CONFIG_OBJ
    _real_dependencies = True
    logger.debug("Using real CONFIG_OBJ in Cell 3 (Reworked).")
except NameError as e:
    logger.critical(f"CRITICAL DEPENDENCY ERROR ({e}). Cannot proceed with Cell 3 logic.", exc_info=True)
    # Define minimal mocks to prevent immediate crashes, but functionality is broken
    _config_obj = None; _real_dependencies = False

logger = logging.getLogger('MIZ-OKI.KGAdapterTool')

# --- Data Pseudonymizer (Improved version from Cell 3 analysis) ---
class DataPseudonymizer:
    """Handles pseudonymization of sensitive data fields using SHA-256 hashing."""
    def __init__(self, salt: Optional[str]):
        if not salt or salt == "default_insecure_salt_replace_me_!!":
            raise ValueError("CRITICAL: Cannot initialize DataPseudonymizer without a valid salt. Set MIZ_SALT env var or secret.")
        self.salt = salt.encode('utf-8')
        # Define sensitive fields (case-insensitive matching recommended in practice)
        self.sensitive_fields = {
            "email", "phone", "ip_address", "name", "user_id", "firstName",
            "lastName", "customer_name", "address", "contact", "customer_email",
            "user_pseudo_id", # Added from GA4 example
            # Add other potentially sensitive fields based on data sources
        }
        self.logger = logging.getLogger('MIZ-OKI.Pseudonymizer')
        self.logger.info("DataPseudonymizer initialized.")

    def _hash(self, value: str) -> str:
        """Generates SHA-256 hash with salt."""
        return hashlib.sha256(self.salt + str(value).strip().lower().encode('utf-8')).hexdigest()

    def pseudonymize_value(self, key: str, value: Any) -> Any:
        """Pseudonymizes a single value if the key indicates sensitivity."""
        # Check key (case-insensitive recommended for robustness)
        is_sensitive = isinstance(key, str) and key.lower() in self.sensitive_fields
        # Pseudonymize only non-empty strings
        if is_sensitive and isinstance(value, str) and value.strip():
            return f"pseudo_{self._hash(value)[:16]}" # Truncated hash
        return value # Return original value otherwise

    def pseudonymize_dict(self, data: Dict) -> Dict:
        """Recursively pseudonymizes sensitive fields in a dictionary."""
        if not isinstance(data, dict):
            return data # Return non-dicts as is

        pseudonymized = {}
        for key, value in data.items():
            if isinstance(value, dict):
                # Recurse into nested dictionaries
                pseudonymized[key] = self.pseudonymize_dict(value)
            elif isinstance(value, list):
                # Process items in lists
                pseudonymized[key] = [
                    self.pseudonymize_dict(item) if isinstance(item, dict)
                    else self.pseudonymize_value(key, item) # Apply to list items based on parent key's sensitivity (might need refinement based on data structure)
                    for item in value
                ]
            else:
                # Apply pseudonymization to the value based on the key
                pseudonymized[key] = self.pseudonymize_value(key, value)
        return pseudonymized

# --- Graph Storage Adapter Interface (Protocol - Remains same) ---
class GraphStorageAdapter(Protocol):
    """Defines the interface for interacting with a graph storage backend."""
    async def connect(self) -> None: ...
    async def close(self) -> None: ...
    async def execute_query(self, query: str, parameters: Optional[Dict] = None, *, database: Optional[str] = None) -> List[Dict]: ...
    @asynccontextmanager
    async def transaction(self, database: Optional[str] = None) -> Any: ... # Yields AsyncSession or equivalent
    async def add_entity(self, entity_dict: Dict, source: str, transaction: Optional[Any] = None) -> Dict: ...
    async def add_relationship(self, rel_dict: Dict, transaction: Optional[Any] = None) -> bool: ...
    async def add_entities_bulk(self, entities: List[Dict], source: str, transaction: Optional[Any] = None) -> Dict: ...
    async def add_relationships_bulk(self, relationships: List[Dict], transaction: Optional[Any] = None) -> Dict: ...
    async def get_entity(self, mizId: str) -> Optional[Dict]: ...
    async def find_entity_by_hints(self, hints: Dict, transaction: Optional[Any] = None) -> Optional[str]: ...
    async def get_neighbors(self, mizId: str, relationship_type: Optional[str] = None, direction: str = "both", limit: int = 250) -> List[Dict]: ...
    async def find_path(self, start_node_hints: Dict, end_node_hints: Dict, relationship_types: Optional[List[str]] = None, max_depth: int = 5) -> Optional[List[Dict]]: ...
    async def get_schema(self) -> Dict: ...
    async def get_stats(self) -> Dict: ...
    # Vector Ops
    async def upsert_vector(self, vector_id: str, vector: List[float], metadata: Dict, namespace: Optional[str] = None) -> bool: ...
    async def search_vector_index(self, query_vector: List[float], k: int, namespace: Optional[str] = None, filter_dict: Optional[Dict] = None) -> List[Tuple[str, float, Dict]]: ...
    async def create_vector_namespace(self, namespace: str) -> bool: ... # May not be applicable to all DBs
    # XAI Storage
    async def save_decision_record(self, record: Dict) -> bool: ...
    async def retrieve_decision_record(self, decision_id: str) -> Optional[Dict]: ...

# --- Neo4j Async Adapter Implementation (Reworked) ---
class Neo4jAsyncAdapter(GraphStorageAdapter):
    """Async Adapter for Neo4j, including Vector DB routing and structured XAI storage."""
    def __init__(self, config: EnhancedConfig):
        if not config: raise ValueError("Configuration object is required for Neo4jAsyncAdapter.")
        self.config = config
        self._driver: Optional[AsyncDriver] = None
        self._vector_index_endpoint_client: Optional[MatchingEngineIndexEndpoint] = None # For Vertex Vector Search
        self.logger = logging.getLogger('MIZ-OKI.Neo4jAsyncAdapter')

        # Validate required config sections
        if not config.kg or not config.kg.neo4j:
            raise ConfigurationError("KG or Neo4j configuration missing in EnhancedConfig.")
        if config.kg.storage_type == "neo4j" and not NEO4J_AVAILABLE:
             raise ImportError("Neo4j configured as storage_type but 'neo4j' async driver not installed.")
        if config.kg.vector_db_type == "vertex_vector_search" and not VECTOR_DB_AVAILABLE:
             logger.warning("Vertex Vector Search configured but SDK is unavailable.")
        if config.kg.vector_db_type == "vertex_vector_search" and not config.kg.vector_db_endpoint:
             logger.warning("Vertex Vector Search configured but endpoint name (VERTEX_VECTOR_INDEX_ENDPOINT_NAME) is missing.")

        self.logger.info(f"Neo4jAsyncAdapter instance created. KG Storage: {config.kg.storage_type}, Vector DB: {config.kg.vector_db_type}")

    async def connect(self) -> None:
        """ Establish connection to Neo4j & potentially Vector DB asynchronously. """
        # Neo4j Connection
        if self.config.kg.storage_type == "neo4j" or self.config.kg.vector_db_type == "neo4j":
            if not self._driver or self._driver.closed:
                if not NEO4J_AVAILABLE: raise ConnectionError("Neo4j async driver not available.")
                neo4j_cfg = self.config.db.neo4j # Use db config section
                if not neo4j_cfg.uri or not neo4j_cfg.user or not neo4j_cfg.password:
                    raise ConnectionError("Neo4j connection details (URI, User, Password) missing in configuration.")
                try:
                    self.logger.info(f"Connecting to Neo4j async driver at {neo4j_cfg.uri}...")
                    self._driver = AsyncGraphDatabase.driver(
                        neo4j_cfg.uri,
                        auth=basic_auth(neo4j_cfg.user, neo4j_cfg.password),
                        max_connection_lifetime=neo4j_cfg.max_connection_lifetime,
                        connection_timeout=neo4j_cfg.connection_timeout,
                        max_connection_pool_size=50 # Example pool size
                    )
                    await self._driver.verify_connectivity()
                    self.logger.info(f"Neo4j async driver connected and verified for {neo4j_cfg.uri}")
                    await self._ensure_constraints_and_indices() # Ensure schema elements exist
                except neo4j_exceptions.AuthError:
                    self.logger.critical(f"Neo4j authentication failed for user '{neo4j_cfg.user}'. Check credentials.")
                    self._driver = None; raise ConnectionError("Neo4j authentication failed.")
                except neo4j_exceptions.ServiceUnavailable:
                     self.logger.critical(f"Neo4j service unavailable at {neo4j_cfg.uri}. Check DB status and URI.")
                     self._driver = None; raise ConnectionError("Neo4j service unavailable.")
                except Exception as e:
                    self.logger.critical(f"Neo4j async connection failed: {e}", exc_info=True)
                    self._driver = None; raise ConnectionError(f"Neo4j connection failed: {e}") from e

        # Vector DB Connection (Vertex AI Vector Search)
        if self.config.kg.vector_db_type == "vertex_vector_search":
            if self._vector_index_endpoint_client is None and VECTOR_DB_AVAILABLE:
                endpoint_name = self.config.kg.vector_db_endpoint
                if endpoint_name:
                    try:
                        self.logger.info(f"Connecting to Vertex AI Vector Search Index Endpoint: {endpoint_name}")
                        # Instantiation is synchronous
                        self._vector_index_endpoint_client = MatchingEngineIndexEndpoint(index_endpoint_name=endpoint_name)
                        # Optional: Add a test call here if needed, e.g., list deployed indexes (sync in thread)
                        # await asyncio.to_thread(self._vector_index_endpoint_client.list_deployed_indexes)
                        self.logger.info(f"Connected to Vertex AI Vector Search Index Endpoint.")
                    except Exception as vec_e:
                        logger.error(f"Failed to initialize Vertex AI Vector Search client: {vec_e}", exc_info=True)
                        self._vector_index_endpoint_client = None # Ensure client is None on failure
                else:
                    logger.warning("vertex_vector_search configured but endpoint name missing in config.kg.vector_db_endpoint.")
            elif not VECTOR_DB_AVAILABLE:
                logger.warning("Vertex AI Matching Engine SDK not available. Cannot connect.")

        elif self.config.kg.vector_db_type not in ["neo4j", "none"]:
           logger.warning(f"Vector DB type '{self.config.kg.vector_db_type}' configured but connection logic not implemented.")

    async def close(self) -> None:
        """Closes Neo4j driver connection asynchronously."""
        if self._driver:
            try:
                await self._driver.close()
                self.logger.info("Neo4j async connection closed.")
            except Exception as e:
                self.logger.error(f"Error closing Neo4j async connection: {e}")
            finally:
                self._driver = None
        # No explicit close needed for Vertex AI client library instances generally

    async def _ensure_constraints_and_indices(self) -> None:
        """Ensure necessary constraints and indices exist in Neo4j."""
        if not self._driver: return
        queries = [
            # Unique constraint on the primary identifier for Entities
            "CREATE CONSTRAINT unique_entity_mizId IF NOT EXISTS FOR (n:Entity) REQUIRE n.mizId IS UNIQUE",
            # Unique constraint for Decision Logs
            "CREATE CONSTRAINT unique_decision_id IF NOT EXISTS FOR (d:DecisionLog) REQUIRE d.decision_id IS UNIQUE",
            # Composite unique constraint for external entities (platform + original_id) - Adjust label if needed
            "CREATE CONSTRAINT unique_external_id IF NOT EXISTS FOR (e:ExternalEntity) REQUIRE (e.platform, e.original_id) IS UNIQUE",
            # Indices for faster lookups
            "CREATE INDEX entity_type_index IF NOT EXISTS FOR (n:Entity) ON (n.entity_type)",
            "CREATE INDEX entity_source_index IF NOT EXISTS FOR (n:Entity) ON (n.source)",
            "CREATE INDEX decision_timestamp_index IF NOT EXISTS FOR (d:DecisionLog) ON (d.timestamp)",
            "CREATE INDEX workflow_exec_id_index IF NOT EXISTS FOR (wf:WorkflowExecution) ON (wf.id)",
            "CREATE INDEX workflow_step_id_index IF NOT EXISTS FOR (ws:WorkflowStep) ON (ws.id)",
        ]
        # Add Neo4j vector index creation if configured
        if self.config.kg.vector_db_type == "neo4j":
             idx_name = self.config.kg.vector_index_name
             dims = self.config.kg.vector_dimensions
             if idx_name and dims > 0:
                 # Assuming embeddings are on :Entity nodes, property 'embedding'
                 queries.append(f"CREATE VECTOR INDEX {idx_name} IF NOT EXISTS FOR (n:Entity) ON (n.embedding) OPTIONS {{indexConfig: {{`vector.dimensions`: {dims}, `vector.similarity_function`: 'cosine'}}}}")
             else:
                 logger.warning("Neo4j vector index configured but name or dimensions missing/invalid.")

        try:
            async with self.transaction() as session: # Use the transaction context manager
                 async with session.begin_transaction() as tx: # Start a transaction
                     for query in queries:
                         try:
                             await tx.run(query)
                             logger.info(f"Applied/verified schema async: {query.split(' FOR')[0]}...")
                         except neo4j_exceptions.ClientError as e:
                             # Ignore errors indicating the constraint/index already exists
                             if "already exists" in str(e).lower() or "Constraint already created" in str(e) or "index already exists" in str(e).lower():
                                 logger.debug(f"Schema item likely exists: {query.split(' FOR')[0]}...")
                             else:
                                 raise # Re-raise other client errors
        except Exception as e:
            logger.warning(f"Failed to ensure schema constraints/indices async: {e}", exc_info=True)

    async def execute_query(self, query: str, parameters: Optional[Dict] = None, *, database: Optional[str] = "neo4j") -> List[Dict]:
        """Executes a read Cypher query asynchronously."""
        if not self._driver or self._driver.closed: await self.connect()
        if not self._driver: raise ConnectionError("Neo4j async driver not connected.")
        parameters = parameters or {}
        try:
            # Use execute_query for potentially simpler read operations or when explicit transaction management isn't needed
            results, summary, keys = await self._driver.execute_query(query, parameters, database_=database)
            # Convert Neo4j Records to dictionaries
            return [r.data() for r in results]
        except neo4j_exceptions.ClientError as e:
            logger.error(f"Cypher query syntax error async: {e}\nQuery: {query}\nParams: {parameters}")
            raise
        except neo4j_exceptions.TransientError as e:
            logger.warning(f"Neo4j transient error async (retrying might help): {e}\nQuery: {query}")
            raise # Or implement retry logic here/caller
        except Exception as e:
            logger.error(f"Cypher query failed async: {e}\nQuery: {query}", exc_info=True)
            raise

    @asynccontextmanager
    async def transaction(self, database: Optional[str] = "neo4j") -> AsyncSession:
        """Provides an asynchronous transaction context."""
        if not self._driver or self._driver.closed: await self.connect()
        if not self._driver: raise ConnectionError("Neo4j async driver not connected.")
        session: Optional[AsyncSession] = None
        try:
            session = self._driver.session(database=database)
            yield session # Yield the session; user manages tx with 'async with session.begin_transaction() as tx:'
        except Exception as e:
            logger.error(f"Exception creating/yielding Neo4j async session: {e}", exc_info=True)
            raise
        finally:
            if session:
                await session.close()

    def _build_merge_clause(self, hints: Dict, variable: str = 'n') -> Tuple[str, Dict, str]:
        """Builds MERGE clause based on hints (sync helper)."""
        params = {}
        merge_parts = []
        node_label = hints.get('type', 'Entity') # Default label
        # Ensure label safety
        safe_node_label = f"`{node_label}`" if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', node_label) else node_label

        # Prioritize mizId if available
        if 'mizId' in hints and hints['mizId']:
            prop = 'mizId'
            param_name = f"{variable}_{prop}_hint"
            merge_parts.append(f"{variable}.{prop} = ${param_name}")
            params[param_name] = hints[prop]
        # Composite key for external entities
        elif hints.get('platform') and hints.get('original_id'):
            # Use a more specific label if possible, e.g., ExternalEntity
            if node_label == 'Entity': node_label = 'ExternalEntity'; safe_node_label = 'ExternalEntity'

            prop_plat = 'platform'
            param_plat = f"{variable}_{prop_plat}_hint"
            prop_id = 'original_id'
            param_id = f"{variable}_{prop_id}_hint"
            merge_parts.append(f"{variable}.{prop_plat} = ${param_plat}")
            merge_parts.append(f"{variable}.{prop_id} = ${param_id}")
            params[param_plat] = hints['platform']
            params[param_id] = hints['original_id']
        # Other potential unique identifiers
        elif hints.get('email'):
            prop = 'email'
            param_name = f"{variable}_{prop}_hint"; merge_parts.append(f"{variable}.{prop} = ${param_name}"); params[param_name] = hints[prop]
        elif hints.get('type') == 'Product' and hints.get('sku'):
            prop = 'sku'
            param_name = f"{variable}_{prop}_hint"; merge_parts.append(f"{variable}.{prop} = ${param_name}"); params[param_name] = hints[prop]
        else:
            raise ValueError(f"Insufficient hints for MERGE clause: {hints}. Need mizId, (platform, original_id), email, or (type=Product, sku).")

        merge_clause = f"MERGE ({variable}:{safe_node_label} {{ {', '.join(merge_parts)} }})"
        return merge_clause, params, node_label # Return the label used

    async def find_entity_by_hints(self, hints: Dict, transaction: Optional[AsyncSession] = None) -> Optional[str]:
        """Finds an entity's mizId based on hints using MATCH."""
        if not hints: return None
        match_parts = []; params = {}; node_label = hints.get('type'); safe_node_label = f"`{node_label}`" if node_label and not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', node_label) else node_label or 'Entity'

        # Build MATCH clause based on available hints (similar logic to _build_merge_clause)
        prop = None
        if 'mizId' in hints and hints['mizId']: prop = 'mizId'
        elif hints.get('platform') and hints.get('original_id'): prop = None # Composite handled below
        elif hints.get('email'): prop = 'email'
        elif hints.get('type') == 'Product' and hints.get('sku'): prop = 'sku'

        if prop:
            param_name = f"find_{prop}_hint"; match_parts.append(f"n.{prop} = ${param_name}"); params[param_name] = hints[prop]
        elif prop is None and hints.get('platform') and hints.get('original_id'):
            if node_label == 'Entity': safe_node_label = 'ExternalEntity' # Assume specific label for composite key
            prop_plat='platform'; param_plat=f"find_{prop_plat}_hint"; prop_id='original_id'; param_id=f"find_{prop_id}_hint"
            match_parts.append(f"n.{prop_plat} = ${param_plat}"); match_parts.append(f"n.{prop_id} = ${param_id}")
            params[param_plat] = hints['platform']; params[param_id] = hints['original_id']
        else:
            logger.warning(f"Insufficient hints for MATCH: {hints}. Cannot find entity.")
            return None

        query = f"MATCH (n:{safe_node_label} {{ {', '.join(match_parts)} }}) RETURN n.mizId AS mizId LIMIT 1"
        try:
            async def _run_find(tx_or_session):
                result = await tx_or_session.run(query, params)
                record = await result.single()
                return record['mizId'] if record and record['mizId'] else None

            if transaction: # If session is passed, run query directly
                 return await _run_find(transaction)
            else: # Manage session internally
                 async with self.transaction() as session:
                      return await _run_find(session)
        except Exception as e:
            logger.error(f"Error finding entity async by hints {hints}: {e}", exc_info=True)
            return None

    async def add_entity(self, entity_dict: Dict, source: str, transaction: Optional[AsyncSession] = None) -> Dict:
        """Adds or updates an entity in Neo4j asynchronously using MERGE."""
        if not isinstance(entity_dict, dict): raise TypeError("entity_dict must be a dictionary.")
        hints = entity_dict.get('_resolution_hints')
        entity_type = entity_dict.get('type') or (hints.get('type') if hints else 'Entity')
        if not hints: hints = {"type": entity_type} # Ensure hints exist

        # Ensure mizId exists, generate if needed
        mizId = hints.get('mizId') or entity_dict.get('mizId') or f"{entity_type}:{uuid.uuid4()}"
        hints['mizId'] = mizId # Ensure hints have the final mizId
        entity_dict['mizId'] = mizId # Ensure entity data has the final mizId

        # Prepare properties, excluding internal keys like _resolution_hints
        properties_to_set = {k: v for k, v in entity_dict.items() if not k.startswith('_')}
        properties_to_set['source'] = source
        now_iso = datetime.now(datetime.timezone.utc).isoformat()
        properties_to_set['updated_at'] = now_iso
        properties_to_set['entity_type'] = entity_type # Ensure type is set as property

        try:
            merge_clause, merge_params, used_label = self._build_merge_clause(hints, 'n')
            params = {**merge_params, 'props': properties_to_set, 'now': now_iso}
            # Ensure the primary label and potentially 'Entity' label are set
            labels_to_set = {f"`{used_label}`", "`Entity`"} # Use backticks for safety
            set_labels_clause = " SET n" + ":".join(labels_to_set)

            # Use ON CREATE and ON MATCH for efficient updates
            query = f"""
            {merge_clause}
            ON CREATE SET n = $props, n.created_at = $now
            ON MATCH SET n += $props
            {set_labels_clause}
            RETURN n.mizId AS mizId, n.created_at = $now AS isNew
            """

            async def _run_merge_in_tx(tx: AsyncTransaction) -> Dict:
                 result = await tx.run(query, params)
                 record = await result.single()
                 summary = await result.consume() # Consume result to get summary
                 nodes_created = summary.counters.nodes_created
                 props_set = summary.counters.properties_set
                 if record:
                     return {"success": True, "mizId": record["mizId"], "is_new": record["isNew"], "_nodes_created": nodes_created, "_props_set": props_set}
                 else:
                     self.logger.error(f"MERGE async op for {hints} returned no result. Query: {query}, Params: {params}")
                     return {"success": False, "error": "MERGE async op returned no result."}

            if transaction: # If session is passed, use it to begin transaction
                 async with transaction.begin_transaction() as tx:
                      return await _run_merge_in_tx(tx)
            else: # Manage session and transaction internally
                 async with self.transaction() as session:
                      # Use session.write_transaction for automatic retry on transient errors
                      return await session.write_transaction(_run_merge_in_tx)

        except ValueError as ve: # Catch errors from _build_merge_clause
             logger.error(f"Error preparing entity merge for {hints}: {ve}")
             return {"success": False, "mizId": mizId, "error": str(ve), "hints": hints}
        except Exception as e:
             logger.error(f"Error adding/updating entity async {hints}: {e}", exc_info=True)
             return {"success": False, "mizId": mizId, "error": str(e), "hints": hints}

    async def add_relationship(self, rel_dict: Dict, transaction: Optional[AsyncSession] = None) -> bool:
        """Adds or updates a relationship between two entities asynchronously."""
        source_hints = rel_dict.get('source_hints')
        target_hints = rel_dict.get('target_hints')
        rel_type = rel_dict.get('type')
        if not source_hints or not target_hints or not rel_type:
            raise ValueError("Missing 'source_hints', 'target_hints', or 'type' for relationship.")

        properties_to_set = {k: v for k, v in rel_dict.items() if k not in ['source_hints', 'target_hints', 'type']}
        now_iso = datetime.now(datetime.timezone.utc).isoformat()
        properties_to_set['updated_at'] = now_iso

        try:
            source_merge, source_params, source_label = self._build_merge_clause(source_hints, 'a')
            target_merge, target_params, target_label = self._build_merge_clause(target_hints, 'b')
            params = {**source_params, **target_params, 'rel_props': properties_to_set, 'now': now_iso}

            # Ensure labels are safe
            safe_rel_type = f"`{rel_type}`" if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', rel_type) else rel_type
            safe_source_label = f"`{source_label}`" if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', source_label) else source_label
            safe_target_label = f"`{target_label}`" if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', target_label) else target_label

            # MERGE nodes first (ensures they exist) and set their primary labels, then MERGE relationship
            query = f"""
            {source_merge} SET a:{safe_source_label}
            {target_merge} SET b:{safe_target_label}
            MERGE (a)-[r:{safe_rel_type}]->(b)
            ON CREATE SET r = $rel_props, r.created_at = $now
            ON MATCH SET r += $rel_props, r.updated_at = $now
            RETURN count(r) as rel_count
            """

            async def _run_rel_merge_in_tx(tx: AsyncTransaction) -> bool:
                 result = await tx.run(query, params)
                 record = await result.single()
                 # Check if record exists and count is valid (>= 0)
                 return record is not None and record["rel_count"] >= 0

            if transaction:
                 async with transaction.begin_transaction() as tx:
                      return await _run_rel_merge_in_tx(tx)
            else:
                 async with self.transaction() as session:
                      return await session.write_transaction(_run_rel_merge_in_tx)

        except ValueError as ve: # Catch errors from _build_merge_clause
             logger.error(f"Error preparing relationship merge '{rel_type}' between {source_hints} and {target_hints}: {ve}")
             return False
        except Exception as e:
             logger.error(f"Error adding/updating relationship async '{rel_type}' between {source_hints} and {target_hints}: {e}", exc_info=True)
             return False

    async def add_entities_bulk(self, entities: List[Dict], source: str, transaction: Optional[AsyncSession] = None) -> Dict:
        """Adds/updates entities in bulk using UNWIND and MERGE (more robust than APOC)."""
        if not entities: return {"new": 0, "updated": 0, "failed": 0, "failures": []}
        batch_data = []; failures = []; start_time = time.monotonic(); now_iso = datetime.now(datetime.timezone.utc).isoformat()

        for entity_dict in entities:
            try:
                 if not isinstance(entity_dict, dict): raise TypeError("Entity item must be a dictionary.")
                 hints = entity_dict.get('_resolution_hints'); entity_type = entity_dict.get('type') or (hints.get('type') if hints else 'Entity')
                 if not hints: hints = {"type": entity_type}
                 mizId = hints.get('mizId') or entity_dict.get('mizId') or f"{entity_type}:{uuid.uuid4()}"; hints['mizId'] = mizId; entity_dict['mizId'] = mizId

                 properties_to_set = {k: v for k, v in entity_dict.items() if not k.startswith('_')}
                 properties_to_set['source'] = source; properties_to_set['updated_at'] = now_iso; properties_to_set['entity_type'] = entity_type

                 # Build merge keys based on hints (similar to _build_merge_clause but just the keys)
                 merge_props = {}
                 if 'mizId' in hints and hints['mizId']: merge_props['mizId'] = hints['mizId']
                 elif hints.get('platform') and hints.get('original_id'): merge_props['platform'] = hints['platform']; merge_props['original_id'] = hints['original_id']
                 # Add other unique key combinations if needed
                 else: raise ValueError(f"Insufficient hints for bulk MERGE: {hints}")

                 used_label = hints.get('type', 'Entity'); safe_label = f"`{used_label}`" if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', used_label) else used_label
                 labels_to_set = [safe_label, "`Entity`"] # Ensure Entity label is also added

                 batch_data.append({
                     "merge_props": merge_props,
                     "set_props": properties_to_set,
                     "labels": labels_to_set
                 })
            except Exception as prep_e:
                 failures.append({"data": entity_dict, "error": str(prep_e)})

        if not batch_data:
            duration=time.monotonic()-start_time; self.logger.error(f"Bulk add entities async failed: No valid data prepared ({len(failures)} failures) in {duration:.3f}s."); return {"new": 0, "updated": 0, "failed": len(failures), "failures": failures}

        # Cypher query using UNWIND and MERGE
        query = """
        UNWIND $batch AS item
        MERGE (n {mizId: item.merge_props.mizId}) // Assuming mizId is the primary merge key for simplicity here
        ON CREATE SET n = item.set_props, n.created_at = $now
        ON MATCH SET n += item.set_props
        // Dynamically set labels - requires APOC if labels are truly dynamic per item
        // Standard Cypher requires labels known at query time. If labels vary, use APOC or multiple queries.
        // Assuming labels are consistent for the batch for now, or handle via APOC:
        // WITH n, item CALL apoc.create.addLabels(n, item.labels) YIELD node
        // RETURN node // If using APOC
        SET n:`Entity` // Set base label if not using APOC for dynamic labels
        WITH n, item.set_props.created_at = $now AS is_new
        RETURN sum(CASE WHEN is_new THEN 1 ELSE 0 END) AS newCount,
               sum(CASE WHEN NOT is_new THEN 1 ELSE 0 END) AS updatedCount
        """
        # Note: The standard Cypher SET n:Label syntax doesn't work well with dynamic labels from the list.
        # If labels truly vary per item in the batch, APOC's `apoc.create.addLabels` is needed, or run separate queries per label combination.
        # The query above assumes `Entity` is the main label and merges primarily on `mizId`. Adjust merge key if needed.

        params = {"batch": batch_data, "now": now_iso}; new_count = 0; updated_count = 0
        try:
            async def _run_bulk_entities_in_tx(tx: AsyncTransaction) -> Tuple[int, int]:
                 result = await tx.run(query, params)
                 record = await result.single()
                 # Handle potential null counts if the query returns nothing
                 nc = record["newCount"] if record and record["newCount"] is not None else 0
                 uc = record["updatedCount"] if record and record["updatedCount"] is not None else 0
                 return nc, uc

            if transaction:
                 async with transaction.begin_transaction() as tx:
                      new_count, updated_count = await _run_bulk_entities_in_tx(tx)
            else:
                 async with self.transaction() as session:
                      new_count, updated_count = await session.write_transaction(_run_bulk_entities_in_tx)

            duration = time.monotonic() - start_time
            self.logger.info(f"Bulk add entities async completed ({duration:.3f}s). New: {new_count}, Updated: {updated_count}, Prep Failed: {len(failures)}")
            return {"new": new_count, "updated": updated_count, "failed": len(failures), "failures": failures}
        except Exception as e:
            duration = time.monotonic() - start_time
            self.logger.error(f"Bulk add entities async failed critically ({duration:.3f}s): {e}", exc_info=True)
            # Add all items intended for the batch to failures on critical DB error
            for item in batch_data: failures.append({"data": item.get("set_props", {}), "error": f"Bulk DB operation failed: {e}"})
            return {"new": 0, "updated": 0, "failed": len(failures), "failures": failures}

    async def add_relationships_bulk(self, relationships: List[Dict], transaction: Optional[AsyncSession] = None) -> Dict:
        """Adds/updates relationships in bulk using UNWIND and MERGE."""
        if not relationships: return {"loaded": 0, "failed": 0, "failures": []}
        batch_data = []; failures = []; start_time = time.monotonic(); now_iso = datetime.now(datetime.timezone.utc).isoformat()

        for rel_dict in relationships:
            try:
                source_hints = rel_dict.get('source_hints'); target_hints = rel_dict.get('target_hints'); rel_type = rel_dict.get('type')
                if not source_hints or not target_hints or not rel_type: raise ValueError("Missing hints or type.")

                # Assume hints contain mizId for bulk operations
                source_mizId = source_hints.get('mizId'); target_mizId = target_hints.get('mizId')
                if not source_mizId or not target_mizId: raise ValueError("Hints must contain mizId for bulk relationship merge.")

                properties_to_set = {k: v for k, v in rel_dict.items() if k not in ['source_hints', 'target_hints', 'type']}
                properties_to_set['updated_at'] = now_iso

                safe_rel_type = f"`{rel_type}`" if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', rel_type) else rel_type

                batch_data.append({
                    "source_mizId": source_mizId,
                    "target_mizId": target_mizId,
                    "rel_type": safe_rel_type,
                    "rel_props": properties_to_set
                })
            except Exception as e:
                failures.append({"data": rel_dict, "error": str(e)})

        if not batch_data:
            duration=time.monotonic()-start_time; self.logger.error(f"Bulk add relationships async failed: No valid data prepared ({len(failures)} failures) in {duration:.3f}s."); return {"loaded": 0, "failed": len(failures), "failures": failures}

        # Use standard MERGE within UNWIND
        query = """
        UNWIND $batch AS item
        MATCH (a:Entity {mizId: item.source_mizId})
        MATCH (b:Entity {mizId: item.target_mizId})
        MERGE (a)-[r:`""" + safe_rel_type + """`]->(b) // Use dynamic relationship type safely
        ON CREATE SET r = item.rel_props, r.created_at = $now
        ON MATCH SET r += item.rel_props, r.updated_at = $now
        RETURN count(r) as total_processed_rels_in_batch
        """
        # Note: This query assumes all relationships in the batch have the SAME type (`safe_rel_type`).
        # If types vary within the batch, APOC's `apoc.merge.relationship` is needed, or run separate batches per type.
        # Example using APOC (if types vary):
        # query_apoc = """
        # UNWIND $batch AS item
        # MATCH (a:Entity {mizId: item.source_mizId})
        # MATCH (b:Entity {mizId: item.target_mizId})
        # CALL apoc.merge.relationship(a, item.rel_type, {}, item.rel_props, b, {}) YIELD rel
        # // Add ON CREATE/MATCH logic if needed via separate SET clauses with WHERE rel.created_at IS NULL etc.
        # SET rel.updated_at = $now
        # RETURN count(rel) as total_processed_rels_in_batch
        # """

        params = {"batch": batch_data, "now": now_iso}; processed_rels_count = 0
        try:
            async def _run_bulk_rels_in_tx(tx: AsyncTransaction) -> int:
                 result = await tx.run(query, params) # Use standard query or query_apoc
                 # Aggregate counts if query returns multiple rows (unlikely with count)
                 total = 0
                 async for record in result:
                     total += record["total_processed_rels_in_batch"] or 0
                 return total

            if transaction:
                 async with transaction.begin_transaction() as tx:
                      processed_rels_count = await _run_bulk_rels_in_tx(tx)
            else:
                 async with self.transaction() as session:
                      processed_rels_count = await session.write_transaction(_run_bulk_rels_in_tx)

            duration = time.monotonic() - start_time
            self.logger.info(f"Bulk add relationships async completed ({duration:.3f}s). Loaded/Updated: {processed_rels_count}, Prep Failed: {len(failures)}")
            return {"loaded": processed_rels_count, "failed": len(failures), "failures": failures}
        except Exception as e:
            duration = time.monotonic() - start_time
            self.logger.error(f"Bulk add relationships async failed critically ({duration:.3f}s): {e}", exc_info=True)
            for item_data in batch_data: failures.append({"data": item_data.get("rel_props",{}), "error": f"Bulk DB operation failed: {e}"})
            return {"loaded": 0, "failed": len(failures), "failures": failures}

    async def get_entity(self, mizId: str) -> Optional[Dict]:
        """Retrieves a single entity by its mizId."""
        query = "MATCH (n:Entity {mizId: $mizId}) RETURN properties(n) AS props"
        result = await self.execute_query(query, {"mizId": mizId})
        return result[0]['props'] if result else None

    async def get_neighbors(self, mizId: str, relationship_type: Optional[str] = None, direction: str = "both", limit: int = 250) -> List[Dict]:
        """Retrieves neighbors of an entity."""
        if direction == "outgoing": arrow = "-[r]->"
        elif direction == "incoming": arrow = "<-[r]-"
        else: arrow = "-[r]-" # Default to both

        params = {"mizId": mizId, "limit": limit}
        rel_match = ""
        if relationship_type:
            safe_rel_type = f"`{relationship_type}`" if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', relationship_type) else relationship_type
            rel_match = f":{safe_rel_type}"

        arrow = arrow.replace('[r]', f'[r{rel_match}]')

        query = f"""
        MATCH (a:Entity {{mizId: $mizId}}){arrow}(b:Entity)
        RETURN b.mizId AS neighborId, properties(b) AS neighborProps, type(r) AS relationshipType, properties(r) AS relationshipProps
        ORDER BY relationshipType, neighborId
        LIMIT $limit
        """
        return await self.execute_query(query, params)

    async def find_path(self, start_node_hints: Dict, end_node_hints: Dict, relationship_types: Optional[List[str]] = None, max_depth: int = 5) -> Optional[List[Dict]]:
        """Finds the shortest path between two nodes."""
        try:
            # Resolve hints to mizIds asynchronously
            start_mizId_task = self.find_entity_by_hints(start_node_hints)
            end_mizId_task = self.find_entity_by_hints(end_node_hints)
            start_mizId, end_mizId = await asyncio.gather(start_mizId_task, end_mizId_task)

            if not start_mizId or not end_mizId:
                logger.warning(f"Cannot find path async: Start/end node hints unresolved. Start: {start_node_hints} -> {start_mizId}, End: {end_node_hints} -> {end_mizId}.")
                return None
            if start_mizId == end_mizId:
                logger.info(f"Start and end node are the same ({start_mizId}). Path length is 0.")
                return [] # Path of length 0

            # Build relationship filter string
            rel_filter = "*" # Default: any relationship type
            if relationship_types:
                safe_rel_types = [f"`{rt}`" if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', rt) else rt for rt in relationship_types]
                rel_filter = "|".join([f":{srt}" for srt in safe_rel_types])

            query = f"""
            MATCH (a:Entity {{mizId: $start_mizId}}), (b:Entity {{mizId: $end_mizId}})
            MATCH p = shortestPath((a)-[{rel_filter}*1..{max_depth}]-(b))
            RETURN [node in nodes(p) | properties(node)] AS nodes,
                   [rel in relationships(p) | {{type: type(rel), props: properties(rel)}}] AS relationships
            LIMIT 1
            """
            params = {"start_mizId": start_mizId, "end_mizId": end_mizId}
            result = await self.execute_query(query, params)

            if result:
                # Return the path structure containing list of nodes and relationships
                return result[0]
            else:
                logger.info(f"No path found async between {start_mizId} and {end_mizId} (Max Depth: {max_depth}, Types: {relationship_types}).")
                return None
        except Exception as e:
            logger.error(f"Error finding path async: {e}", exc_info=True)
            return None

    async def get_schema(self) -> Dict:
        """Retrieves the graph schema (labels, relationship types, property keys)."""
        schema = {"labels": [], "relationship_types": [], "property_keys": []}
        try:
            labels_query = "CALL db.labels() YIELD label RETURN collect(label) AS labels"
            rels_query = "CALL db.relationshipTypes() YIELD relationshipType RETURN collect(relationshipType) AS relationshipTypes"
            props_query = "CALL db.propertyKeys() YIELD propertyKey RETURN collect(propertyKey) AS propertyKeys"

            labels_result, rels_result, props_result = await asyncio.gather(
                self.execute_query(labels_query),
                self.execute_query(rels_query),
                self.execute_query(props_query),
                return_exceptions=True
            )

            if not isinstance(labels_result, Exception) and labels_result: schema["labels"] = labels_result[0].get('labels', [])
            if not isinstance(rels_result, Exception) and rels_result: schema["relationship_types"] = rels_result[0].get('relationshipTypes', [])
            if not isinstance(props_result, Exception) and props_result: schema["property_keys"] = props_result[0].get('propertyKeys', [])

        except Exception as e:
            logger.error(f"Error getting graph schema async: {e}", exc_info=True)
            schema["error"] = str(e)
        return schema

    async def get_stats(self) -> Dict:
        """Retrieves basic graph statistics."""
        stats = {"nodes": -1, "edges": -1}
        try:
            # Use count store for potentially faster results if available and accurate
            # stats_query = "CALL db.stats.retrieve('GRAPH_COUNTS') YIELD nodes, relationships RETURN nodes, relationships"
            # result = await self.execute_query(stats_query)
            # if result: stats = {"nodes": result[0].get('nodes', -1), "edges": result[0].get('relationships', -1)}

            # Fallback to MATCH COUNT if stats procedure fails or gives invalid results
            # if stats["nodes"] <= 0 or stats["edges"] <= 0:
            logger.debug("Falling back to MATCH COUNT for graph stats.")
            nodes_query = "MATCH (n) RETURN count(n) AS nodeCount"
            rels_query = "MATCH ()-[r]->() RETURN count(r) AS relationshipCount"
            node_result, rel_result = await asyncio.gather(
                self.execute_query(nodes_query),
                self.execute_query(rels_query),
                return_exceptions=True
            )
            if not isinstance(node_result, Exception) and node_result: stats["nodes"] = node_result[0].get('nodeCount', -1)
            if not isinstance(rel_result, Exception) and rel_result: stats["edges"] = rel_result[0].get('relationshipCount', -1)

        except Exception as e:
            logger.error(f"Error getting graph stats async: {e}", exc_info=True)
            stats["error"] = str(e)
        return stats

    # --- Vector Operations (Reworked) ---
    async def upsert_vector(self, vector_id: str, vector: List[float], metadata: Dict, namespace: Optional[str] = None) -> bool:
        """Upserts a vector into the configured vector database."""
        db_type = self.config.kg.vector_db_type
        namespace = namespace or "default" # Default namespace if not provided
        self.logger.debug(f"Upserting vector async. ID: {vector_id}, DB: {db_type}, Namespace: {namespace}")

        if db_type == "vertex_vector_search":
            if self._vector_index_endpoint_client and VECTOR_DB_AVAILABLE:
                try:
                    # Format metadata for Vertex AI Vector Search restricts
                    # Note: Vertex AI uses 'allow_list' for filtering. Metadata keys become restrict namespaces.
                    restricts = []
                    if metadata:
                        for key, value in metadata.items():
                             # Ensure value is a list of strings for allow_list
                             allow_list_values = [str(v) for v in value] if isinstance(value, list) else [str(value)]
                             restricts.append(Namespace(name=key, allow_list=allow_list_values))

                    datapoint = aiplatform.matching_engine.Datapoint(
                        datapoint_id=vector_id,
                        feature_vector=vector,
                        restricts=restricts
                    )
                    # Use upsert_datapoints (sync SDK method) in a thread
                    await asyncio.to_thread(
                        self._vector_index_endpoint_client.upsert_datapoints,
                        datapoints=[datapoint]
                    )
                    self.logger.info(f"Vertex Vector Search upsert successful for ID: {vector_id}")
                    return True
                except Exception as e:
                    logger.error(f"Vertex Vector Search upsert failed for ID {vector_id}: {e}", exc_info=True)
                    return False
            else:
                logger.error("Vertex Vector Search client not initialized or SDK unavailable for upsert.")
                return False

        elif db_type == "neo4j":
            # Assumes vector_id corresponds to a mizId of an :Entity node
            return await self.add_node_embedding(mizId=vector_id, embedding=vector, embedding_property=metadata.get("embedding_property", "embedding"))

        else:
            logger.error(f"Vector upsert not supported for db_type: '{db_type}'")
            return False

    async def search_vector_index(self, query_vector: List[float], k: int, namespace: Optional[str] = None, filter_dict: Optional[Dict] = None) -> List[Tuple[str, float, Dict]]:
        """Searches the vector index asynchronously."""
        db_type = self.config.kg.vector_db_type
        namespace = namespace or "default"
        self.logger.debug(f"Searching vector index async. K={k}, DB: {db_type}, Namespace: {namespace}, Filter: {filter_dict}")

        if db_type == "vertex_vector_search":
            if self._vector_index_endpoint_client and VECTOR_DB_AVAILABLE:
                try:
                    # Prepare filters for Vertex AI ('restricts')
                    search_restricts = []
                    if filter_dict:
                         for key, value in filter_dict.items():
                              allow_list_values = [str(v) for v in value] if isinstance(value, list) else [str(value)]
                              search_restricts.append(Namespace(name=key, allow_list=allow_list_values))

                    # Use find_neighbors (sync SDK method) in a thread
                    # Note: Specify deployed_index_id if multiple indexes are deployed to the endpoint
                    deployed_index_id = os.getenv("VERTEX_VECTOR_DEPLOYED_INDEX_ID") # Get from env if needed
                    if not deployed_index_id:
                        logger.warning("VERTEX_VECTOR_DEPLOYED_INDEX_ID not set, assuming only one index deployed to endpoint.")

                    response: FindNeighborsResponse = await asyncio.to_thread(
                        self._vector_index_endpoint_client.find_neighbors,
                        queries=[query_vector],
                        deployed_index_id=deployed_index_id, # Pass deployed index ID
                        num_neighbors=k,
                        filter=search_restricts
                    )

                    # Parse response: response[0] contains neighbors for the first query
                    neighbors = response[0].neighbors if response and response[0].neighbors else []
                    # Return format: List[Tuple[vector_id, score, metadata_dict]]
                    # Vertex AI find_neighbors doesn't return metadata directly. Metadata is usually stored elsewhere (like KG).
                    # Score is 1.0 - distance for cosine similarity.
                    results = [(match.datapoint.datapoint_id, 1.0 - match.distance, {"source": "vertex_vector_search"}) for match in neighbors]
                    self.logger.info(f"Vertex Vector Search successful. Found {len(results)} neighbors.")
                    return results
                except Exception as e:
                    logger.error(f"Vertex Vector Search failed: {e}", exc_info=True)
                    return []
            else:
                logger.error("Vertex Vector Search client not initialized or SDK unavailable for search.")
                return []

        elif db_type == "neo4j":
             index_name = self.config.kg.vector_index_name
             if index_name and self._driver:
                 # Call internal Neo4j vector search helper
                 results_neo = await self._neo4j_vector_search(query_vector, index_name, k, filter_dict)
                 # Neo4j vector search typically returns node IDs and scores. Metadata needs separate fetch.
                 # Fetch metadata separately if needed, e.g., using get_entity
                 # For now, return placeholder metadata
                 return [(nid, score, {"source": "neo4j"}) for nid, score in results_neo]
             else:
                 logger.error("Neo4j vector index name not configured or driver unavailable.")
                 return []
        else:
            logger.error(f"Vector search not supported for db_type: '{db_type}'")
            return []

    async def create_vector_namespace(self, namespace: str) -> bool:
        """Creates a namespace (if applicable to the vector DB). Placeholder."""
        db_type = self.config.kg.vector_db_type
        self.logger.info(f"Placeholder: Creating vector namespace '{namespace}' for DB type '{db_type}'.")
        # Implementation depends heavily on the specific vector DB.
        # Vertex AI Vector Search uses restricts, not explicit namespaces in this way.
        # Neo4j doesn't have namespaces in the same way either.
        return True # Simulate success

    # --- Neo4j Specific Vector Methods (Internal Helpers - Async) ---
    async def add_node_embedding(self, mizId: str, embedding: List[float], embedding_property: str = "embedding", transaction: Optional[AsyncSession] = None) -> bool:
        """Sets the embedding property on a specific node asynchronously."""
        if not self._driver: return False
        query = f"MATCH (n:Entity {{mizId: $mizId}}) SET n.`{embedding_property}` = $embedding"
        params = {"mizId": mizId, "embedding": embedding}
        try:
            async def _run_set_embedding(tx: AsyncTransaction) -> bool:
                 result = await tx.run(query, params)
                 summary = await result.consume()
                 return summary.counters.properties_set > 0

            if transaction:
                 async with transaction.begin_transaction() as tx: return await _run_set_embedding(tx)
            else:
                 async with self.transaction() as session: return await session.write_transaction(_run_set_embedding)
        except Exception as e:
            logger.error(f"Failed to set embedding for node {mizId}: {e}", exc_info=True)
            return False

    async def _neo4j_vector_search(self, query_vector: List[float], index_name: str, k: int = 5, filter_dict: Optional[Dict] = None) -> List[Tuple[str, float]]:
        """Performs vector similarity search using Neo4j's vector index."""
        if not self._driver: return []
        # Basic query structure - filtering needs to be added based on filter_dict
        # Example filter: WHERE n.category = 'some_category'
        filter_clause = ""
        params = {"queryVector": query_vector, "k": k}
        if filter_dict:
            filter_parts = []
            for i, (key, value) in enumerate(filter_dict.items()):
                param_name = f"filter_val_{i}"
                filter_parts.append(f"n.`{key}` = ${param_name}")
                params[param_name] = value
            if filter_parts:
                filter_clause = "WHERE " + " AND ".join(filter_parts)

        # Ensure index_name is safe if needed, though usually predefined
        safe_index_name = index_name # Add validation/escaping if index names can be dynamic/unsafe

        query = f"""
        CALL db.index.vector.queryNodes('{safe_index_name}', $k, $queryVector) YIELD node, score
        {filter_clause}
        RETURN node.mizId AS mizId, score
        """
        try:
            results = await self.execute_query(query, params)
            return [(r['mizId'], r['score']) for r in results]
        except Exception as e:
            logger.error(f"Neo4j vector search failed for index '{safe_index_name}': {e}", exc_info=True)
            return []

    # --- XAI Storage Methods (Reworked for Structured Storage - Async) ---
    async def save_decision_record(self, record: Dict) -> bool:
        """Saves a decision record with structured nodes and relationships in Neo4j asynchronously."""
        decision_id = record.get('decision_id')
        if not decision_id:
            logger.error("Cannot save decision log: missing 'decision_id'.")
            return False
        if not self._driver:
             logger.error(f"Cannot save decision log {decision_id}: Neo4j driver unavailable.")
             return False

        logger.debug(f"Saving structured decision log async for ID: {decision_id}")
        try:
            # Prepare parameters, ensuring JSON serializability for complex fields
            def safe_json_dumps(data):
                try: return json.dumps(data, default=str)
                except TypeError: return json.dumps(str(data)) # Fallback

            params = {
                "decision_id": decision_id,
                "component": record.get("component"),
                "timestamp_iso": record.get("timestamp", datetime.now(datetime.timezone.utc).isoformat()),
                "model_used": record.get("model_used"),
                "workflow_exec_id": record.get("workflow_execution_id"),
                "workflow_step_id": record.get("workflow_step_id"),
                "decision_json": safe_json_dumps(record.get("decision", {})),
                "inputs_json": safe_json_dumps(record.get("inputs", {})),
                "outputs_json": safe_json_dumps(record.get("outputs", {})),
                "context_json": safe_json_dumps(record.get("context", {})),
                "cot": record.get("chain_of_thought", []) # Store list directly
            }

            # Cypher query to create/update nodes and relationships
            query = """
            MERGE (log:DecisionLog {decision_id: $decision_id})
            ON CREATE SET log.created_at = datetime($timestamp_iso)
            SET log.component = $component,
                log.timestamp = datetime($timestamp_iso),
                log.model_used = $model_used,
                log.decision_json = $decision_json,
                log.inputs_json = $inputs_json,
                log.outputs_json = $outputs_json,
                log.context_json = $context_json,
                log.chain_of_thought = $cot

            // Link to Workflow Execution if ID provided
            WITH log
            WHERE $workflow_exec_id IS NOT NULL AND $workflow_exec_id <> ""
            MERGE (wfExec:WorkflowExecution {id: $workflow_exec_id})
              ON CREATE SET wfExec.first_seen = datetime()
            MERGE (log)-[:PART_OF_EXECUTION]->(wfExec)

            // Link to Workflow Step if ID provided
            WITH log, wfExec
            WHERE $workflow_step_id IS NOT NULL AND $workflow_step_id <> "" AND wfExec IS NOT NULL
            MERGE (wfStep:WorkflowStep {id: $workflow_step_id})
              ON CREATE SET wfStep.first_seen = datetime()
            // Ensure step is linked to execution (might be created elsewhere)
            MERGE (wfStep)-[:PART_OF_EXECUTION]->(wfExec)
            MERGE (log)-[:EXECUTED_STEP]->(wfStep)

            RETURN count(log) as count
            """

            async def _run_save_decision(tx: AsyncTransaction) -> bool:
                 result = await tx.run(query, params)
                 record = await result.single()
                 summary = await result.consume()
                 # Check if the node was created or properties were set
                 success = (record is not None and record["count"] == 1) or summary.counters.properties_set > 0
                 return success

            async with self.transaction() as session:
                success = await session.write_transaction(_run_save_decision)

            if success: logger.debug(f"Saved/Updated structured decision log {decision_id}.")
            else: logger.error(f"Failed to save/update structured decision log {decision_id}.")
            return success
        except Exception as e:
            logger.error(f"Failed to save structured decision record {decision_id} to KG async: {e}", exc_info=True)
            return False

    async def retrieve_decision_record(self, decision_id: str) -> Optional[Dict]:
        """Retrieves a structured decision record from Neo4j asynchronously."""
        if not self._driver:
             logger.error(f"Cannot retrieve decision log {decision_id}: Neo4j driver unavailable.")
             return None
        logger.info(f"Retrieving structured decision log async for ID: {decision_id}")
        query = """
        MATCH (log:DecisionLog {decision_id: $decision_id})
        OPTIONAL MATCH (log)-[:EXECUTED_STEP]->(step:WorkflowStep)
        OPTIONAL MATCH (log)-[:PART_OF_EXECUTION]->(exec:WorkflowExecution)
        RETURN
            properties(log) AS log_props,
            properties(step) AS step_props,
            properties(exec) AS exec_props
        LIMIT 1
        """
        params = {"decision_id": decision_id}
        try:
            results = await self.execute_query(query, params)
            if not results:
                logger.warning(f"Decision log {decision_id} not found in KG.")
                return None

            log_data = results[0].get("log_props", {})
            step_data = results[0].get("step_props")
            exec_data = results[0].get("exec_props")

            # Reconstruct the record, parsing JSON properties safely
            record = {
                "decision_id": log_data.get("decision_id"),
                "timestamp": log_data.get("timestamp"), # Neo4j driver handles datetime conversion
                "component": log_data.get("component"),
                "model_used": log_data.get("model_used"),
                "chain_of_thought": log_data.get("chain_of_thought", []),
                "workflow_execution_id": exec_data.get("id") if exec_data else log_data.get("workflow_execution_id"),
                "workflow_step_id": step_data.get("id") if step_data else log_data.get("workflow_step_id"),
            }
            for json_field in ["decision", "inputs", "outputs", "context"]:
                json_str = log_data.get(f"{json_field}_json")
                if json_str:
                    try: record[json_field] = json.loads(json_str)
                    except json.JSONDecodeError: record[json_field] = {"error": f"Failed to parse {json_field} JSON", "raw": json_str}
                else: record[json_field] = {} # Default to empty dict if field missing

            return record
        except Exception as e:
            logger.error(f"Failed to retrieve structured decision record {decision_id} from KG async: {e}", exc_info=True)
            return None


# --- KnowledgeGraphToolService API Layer (Conceptual Reworked Async) ---
# This class wraps the adapter and provides the API endpoints (e.g., for FastAPI)
# It handles MIZ OKI payload parsing, validation, pseudonymization, and response formatting.

# --- Pydantic Models for API Payloads (Conceptual - Requires FastAPI/Pydantic) ---
if FASTAPI_AVAILABLE:
    class MizOkiBase(BaseModel):
        miz_oki_version: str = Field(default_factory=lambda: _config_obj.miz_oki_schema_version if _config_obj else "unknown")
        request_id: str = Field(default_factory=lambda: f"req_{uuid.uuid4().hex[:8]}")
        trace_id: Optional[str] = None
        workflow_execution_id: Optional[str] = None
        step_id: Optional[str] = None
        timestamp: str = Field(default_factory=lambda: datetime.now(datetime.timezone.utc).isoformat())
        source_component: Optional[str] = None
        target_component: Optional[str] = None
        security_context: Optional[Dict[str, Any]] = None # For potential auth info
        metadata: Optional[Dict[str, Any]] = None

    class MizOkiRequest(MizOkiBase):
        payload: Dict[str, Any]

    class MizOkiResponse(MizOkiBase):
        status: str # e.g., "success", "error", "partial_success", "not_found", "bad_request"
        payload: Optional[Dict[str, Any]] = None
        error_details: Optional[List[Dict[str, Any]]] = None

    # Example specific payload models (can be added for stricter validation)
    class EntityPayload(BaseModel):
        entity_data: Dict[str, Any]
        source: Optional[str] = "api"

    class BulkEntitiesPayload(BaseModel):
        entities: List[Dict[str, Any]]
        source: Optional[str] = "bulk_api"

    class RelationshipPayload(BaseModel):
        relationship_data: Dict[str, Any]

    class BulkRelationshipsPayload(BaseModel):
        relationships: List[Dict[str, Any]]

    class HintsPayload(BaseModel):
        hints: Dict[str, Any]

    class NeighborsPayload(BaseModel):
        mizId: str
        relationship_type: Optional[str] = None
        direction: Optional[str] = "both"
        limit: Optional[int] = 250

    class PathPayload(BaseModel):
        start_node_hints: Dict[str, Any]
        end_node_hints: Dict[str, Any]
        relationship_types: Optional[List[str]] = None
        max_depth: Optional[int] = 5

    class VectorUpsertPayload(BaseModel):
        vector_id: str
        vector: List[float]
        metadata: Dict[str, Any]
        namespace: Optional[str] = None

    class VectorSearchPayload(BaseModel):
        query_vector: List[float]
        k: int
        namespace: Optional[str] = None
        filter_dict: Optional[Dict[str, Any]] = None
        index_name: Optional[str] = None # Allow overriding default index

    class DecisionRecordPayload(BaseModel):
        record: Dict[str, Any]

# --- End Pydantic Models ---

class KnowledgeGraphToolService:
    """ Conceptual Service wrapping the GraphStorageAdapter. Handles MIZ OKI Payloads. """
    def __init__(self, config: EnhancedConfig):
        if not config: raise InitializationError("EnhancedConfig is required for KnowledgeGraphToolService.")
        self.config = config
        self.adapter: Optional[GraphStorageAdapter] = None
        self.pseudonymizer: Optional[DataPseudonymizer] = None
        self.logger = logging.getLogger('MIZ-OKI.KGToolService')
        self._adapter_initialized = False
        self._initialization_lock = asyncio.Lock()

    async def initialize_adapter(self):
        """Initializes the backend adapter (Neo4j) and pseudonymizer."""
        async with self._initialization_lock:
            if self._adapter_initialized: return
            try:
                # Initialize Adapter
                if self.config.kg.storage_type == "neo4j":
                    if not NEO4J_AVAILABLE: raise InitializationError("Neo4j driver not available, cannot initialize Neo4j adapter.")
                    self.adapter = Neo4jAsyncAdapter(self.config)
                # elif self.config.kg.storage_type == "other": self.adapter = OtherDbAsyncAdapter(self.config) # Example
                else:
                    raise InitializationError(f"Unsupported kg.storage_type: {self.config.kg.storage_type}")

                await self.adapter.connect() # Connect during initialization

                # Initialize Pseudonymizer
                if not self.config.miz_salt: raise InitializationError("MIZ_SALT is missing, cannot initialize pseudonymizer.")
                self.pseudonymizer = DataPseudonymizer(self.config.miz_salt)

                self._adapter_initialized = True
                self.logger.info(f"KG Tool Service adapter initialized: {type(self.adapter).__name__}")
            except Exception as e:
                self.logger.critical(f"KG Tool Service: CRITICAL Error initializing adapter: {e}", exc_info=True)
                self.adapter = None # Ensure adapter is None on failure
                self.pseudonymizer = None
                self._adapter_initialized = False
                raise InitializationError(f"Failed to initialize KG Adapter: {e}") from e

    async def close_adapter(self):
        """Closes the backend adapter connection."""
        if hasattr(self, 'adapter') and self.adapter:
            await self.adapter.close()
            self._adapter_initialized = False
            self.logger.info("KG Tool Service adapter closed.")

    async def ensure_adapter(self):
        """Ensures the adapter is initialized, calling initialize_adapter if needed."""
        if not self._adapter_initialized or self.adapter is None or self.pseudonymizer is None:
            await self.initialize_adapter()
        # Check again after attempting initialization
        if self.adapter is None or self.pseudonymizer is None:
             raise RuntimeError("KG Adapter or Pseudonymizer failed to initialize and is required.")

    def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
        """Helper to construct a standard MIZ OKI response."""
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_data.get("request_id"),
            "trace_id": request_data.get("trace_id"),
            "workflow_execution_id": request_data.get("workflow_execution_id"),
            "step_id": request_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "KnowledgeGraphToolService",
            "target_component": request_data.get("source_component"),
            "status": status,
            "payload": payload,
            "error_details": errors,
            "metadata": {} # Add processing time later if needed
        }

    # --- API Endpoint Handlers (Conceptual FastAPI Structure - Reworked Async) ---
    # These methods would be decorated with @app.post, @app.get etc. in a FastAPI app.
    # They accept the MIZ OKI request dict and return the MIZ OKI response dict.

    # @app.post("/entities", response_model=MizOkiResponse, status_code=201)
    async def add_entity_endpoint(self, request: Dict = Body(...)) -> Dict:
        """ API endpoint to add/update an entity, handling MIZ OKI payload and pseudonymization. """
        start_time = time.monotonic(); response_payload = {}; status = "error"; errors = []
        try:
            await self.ensure_adapter() # Ensure adapter and pseudonymizer are ready
            # TODO: Add permission check based on request.get("security_context")
            # Validate input payload structure (basic check, Pydantic model preferred)
            if not isinstance(request.get("payload"), dict) or "entity_data" not in request["payload"]:
                 raise ValueError("Invalid payload: 'payload.entity_data' is required.")

            entity_data = request["payload"]["entity_data"]
            source = request["payload"].get("source", "api") # Default source if not provided

            # Apply pseudonymization BEFORE sending to adapter
            processed_data = self.pseudonymizer.pseudonymize_dict(entity_data)

            # Call the adapter method
            result = await self.adapter.add_entity(processed_data, source)

            if result.get("success"):
                status = "success"; response_payload = result
            else:
                status = "failed"; errors.append({"code": "KG_ADD_FAILED", "message": result.get("error", "Unknown KG error"), "details": result.get("hints")})

        except (ValueError, TypeError) as ve:
            status = "bad_request"; errors.append({"code": "VALIDATION_ERROR", "message": str(ve)})
        except InitializationError as ie:
             status = "service_unavailable"; errors.append({"code": "INIT_ERROR", "message": str(ie)})
        except ConnectionError as ce:
             status = "service_unavailable"; errors.append({"code": "DB_CONNECTION_ERROR", "message": str(ce)})
        except Exception as e:
            status = "internal_error"; errors.append({"code": "INTERNAL_SERVER_ERROR", "message": str(e)}); self.logger.error(f"API Error adding entity: {e}", exc_info=True)

        response = self._create_miz_oki_response(request, status, response_payload if status == "success" else None, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

    # @app.post("/entities/bulk", response_model=MizOkiResponse)
    async def add_entities_bulk_endpoint(self, request: Dict = Body(...)) -> Dict:
        """ API endpoint for bulk entity addition/update. """
        start_time = time.monotonic(); response_payload = {}; status = "error"; errors = []
        try:
            await self.ensure_adapter()
            # TODO: Permission check
            if not isinstance(request.get("payload"), dict) or "entities" not in request["payload"]:
                 raise ValueError("Invalid payload: 'payload.entities' (list) is required.")

            entities = request["payload"]["entities"]
            source = request["payload"].get("source", "bulk_api")
            if not isinstance(entities, list): raise ValueError("'payload.entities' must be a list.")

            # Pseudonymize each entity in the list
            processed_entities = [self.pseudonymizer.pseudonymize_dict(e) for e in entities if isinstance(e, dict)]
            invalid_items = len(entities) - len(processed_entities)
            if invalid_items > 0: logger.warning(f"{invalid_items} items in bulk entity request were not dictionaries and were skipped.")

            if not processed_entities: raise ValueError("No valid entity data provided in the list.")

            # Call adapter method
            result = await self.adapter.add_entities_bulk(processed_entities, source)

            # Result includes counts and specific failures from the adapter
            response_payload = result
            failed_count = result.get("failed", 0)
            if failed_count == 0: status = "success"
            elif failed_count < len(processed_entities): status = "partial_success"
            else: status = "failed"
            if failed_count > 0: errors = result.get("failures", []) # Pass adapter failures back

        except (ValueError, TypeError) as ve:
            status = "bad_request"; errors.append({"code": "VALIDATION_ERROR", "message": str(ve)})
        except InitializationError as ie:
             status = "service_unavailable"; errors.append({"code": "INIT_ERROR", "message": str(ie)})
        except ConnectionError as ce:
             status = "service_unavailable"; errors.append({"code": "DB_CONNECTION_ERROR", "message": str(ce)})
        except Exception as e:
            status = "internal_error"; errors.append({"code": "INTERNAL_SERVER_ERROR", "message": str(e)}); self.logger.error(f"API Error adding entities bulk: {e}", exc_info=True)

        response = self._create_miz_oki_response(request, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

    # --- Other Endpoints (Implement similarly, calling corresponding adapter methods) ---

    # @app.post("/relationships", response_model=MizOkiResponse, status_code=201)
    async def add_relationship_endpoint(self, request: Dict = Body(...)) -> Dict:
         start_time = time.monotonic(); status = "error"; errors = []; response_payload = None
         try:
             await self.ensure_adapter()
             # TODO: Permission check
             if not isinstance(request.get("payload"), dict) or "relationship_data" not in request["payload"]:
                  raise ValueError("Invalid payload: 'payload.relationship_data' is required.")
             rel_data = request["payload"]["relationship_data"]
             # Pseudonymize hints if they contain sensitive info (depends on hint structure)
             # processed_rel_data = self.pseudonymizer.pseudonymize_dict(rel_data) # Apply carefully based on expected hint content
             processed_rel_data = rel_data # Assuming hints don't need it for now
             success = await self.adapter.add_relationship(processed_rel_data)
             if success: status = "success"; response_payload = {"relationship_added": True}
             else: status = "failed"; errors.append({"code": "KG_REL_ADD_FAILED", "message": "Failed to add/update relationship."})
         except (ValueError, TypeError) as ve: status = "bad_request"; errors.append({"code": "VALIDATION_ERROR", "message": str(ve)})
         except InitializationError as ie: status = "service_unavailable"; errors.append({"code": "INIT_ERROR", "message": str(ie)})
         except ConnectionError as ce: status = "service_unavailable"; errors.append({"code": "DB_CONNECTION_ERROR", "message": str(ce)})
         except Exception as e: status = "internal_error"; errors.append({"code": "INTERNAL_SERVER_ERROR", "message": str(e)}); self.logger.error(f"API Error adding relationship: {e}", exc_info=True)
         response = self._create_miz_oki_response(request, status, response_payload, errors if errors else None)
         response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
         return response

    # @app.post("/relationships/bulk", response_model=MizOkiResponse)
    async def add_relationships_bulk_endpoint(self, request: Dict = Body(...)) -> Dict:
         start_time = time.monotonic(); status = "error"; errors = []; response_payload = None
         try:
             await self.ensure_adapter()
             # TODO: Permission check
             if not isinstance(request.get("payload"), dict) or "relationships" not in request["payload"]:
                  raise ValueError("Invalid payload: 'payload.relationships' (list) is required.")
             relationships = request["payload"]["relationships"]
             if not isinstance(relationships, list): raise ValueError("'payload.relationships' must be a list.")
             # processed_rels = [self.pseudonymizer.pseudonymize_dict(r) for r in relationships if isinstance(r, dict)] # Apply pseudonymization carefully
             processed_rels = [r for r in relationships if isinstance(r, dict)]
             if not processed_rels: raise ValueError("No valid relationship data provided.")
             result = await self.adapter.add_relationships_bulk(processed_rels)
             response_payload = result; failed_count = result.get("failed", 0)
             if failed_count == 0: status = "success"
             elif failed_count < len(processed_rels): status = "partial_success"
             else: status = "failed"
             if failed_count > 0: errors = result.get("failures", [])
         except (ValueError, TypeError) as ve: status = "bad_request"; errors.append({"code": "VALIDATION_ERROR", "message": str(ve)})
         except InitializationError as ie: status = "service_unavailable"; errors.append({"code": "INIT_ERROR", "message": str(ie)})
         except ConnectionError as ce: status = "service_unavailable"; errors.append({"code": "DB_CONNECTION_ERROR", "message": str(ce)})
         except Exception as e: status = "internal_error"; errors.append({"code": "INTERNAL_SERVER_ERROR", "message": str(e)}); self.logger.error(f"API Error adding relationships bulk: {e}", exc_info=True)
         response = self._create_miz_oki_response(request, status, response_payload, errors if errors else None)
         response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
         return response

    # @app.get("/entities/{mizId}", response_model=MizOkiResponse)
    async def get_entity_endpoint(self, request: Request, mizId: str) -> Dict: # Pass request for context
         start_time = time.monotonic(); status = "error"; errors = []; response_payload = None
         request_data = {"request_id": request.headers.get("X-Request-ID"), "trace_id": request.headers.get("X-Trace-ID")} # Example: Get IDs from headers
         try:
             await self.ensure_adapter()
             # TODO: Permission check
             entity_data = await self.adapter.get_entity(mizId)
             if entity_data:
                  # Apply pseudonymization if needed based on request context/role (optional)
                  # processed_entity_data = self.pseudonymizer.pseudonymize_dict(entity_data)
                  processed_entity_data = entity_data # Assume raw data return for now
                  status = "success"; response_payload = {"entity_data": processed_entity_data}
             else: status = "not_found"; errors.append({"code": "ENTITY_NOT_FOUND", "message": f"Entity with mizId '{mizId}' not found."})
         except InitializationError as ie: status = "service_unavailable"; errors.append({"code": "INIT_ERROR", "message": str(ie)})
         except ConnectionError as ce: status = "service_unavailable"; errors.append({"code": "DB_CONNECTION_ERROR", "message": str(ce)})
         except Exception as e: status = "internal_error"; errors.append({"code": "INTERNAL_SERVER_ERROR", "message": str(e)}); self.logger.error(f"API Error getting entity {mizId}: {e}", exc_info=True)
         response = self._create_miz_oki_response(request_data, status, response_payload, errors if errors else None)
         response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
         return response

    # @app.post("/entities/find", response_model=MizOkiResponse)
    async def find_entity_endpoint(self, request: Dict = Body(...)) -> Dict:
         start_time = time.monotonic(); status = "error"; errors = []; response_payload = None
         try:
             await self.ensure_adapter()
             # TODO: Permission check
             if not isinstance(request.get("payload"), dict) or "hints" not in request["payload"]:
                  raise ValueError("Invalid payload: 'payload.hints' (dict) is required.")
             hints = request["payload"]["hints"]
             mizId = await self.adapter.find_entity_by_hints(hints)
             if mizId: status = "success"; response_payload = {"mizId": mizId}
             else: status = "not_found"; errors.append({"code": "ENTITY_NOT_FOUND", "message": f"Entity not found for hints: {hints}"})
         except (ValueError, TypeError) as ve: status = "bad_request"; errors.append({"code": "VALIDATION_ERROR", "message": str(ve)})
         except InitializationError as ie: status = "service_unavailable"; errors.append({"code": "INIT_ERROR", "message": str(ie)})
         except ConnectionError as ce: status = "service_unavailable"; errors.append({"code": "DB_CONNECTION_ERROR", "message": str(ce)})
         except Exception as e: status = "internal_error"; errors.append({"code": "INTERNAL_SERVER_ERROR", "message": str(e)}); self.logger.error(f"API Error finding entity by hints: {e}", exc_info=True)
         response = self._create_miz_oki_response(request, status, response_payload, errors if errors else None)
         response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
         return response

    # @app.get("/entities/{mizId}/neighbors", response_model=MizOkiResponse)
    async def get_neighbors_endpoint(self, request: Request, mizId: str, relationship_type: Optional[str] = None, direction: str = "both", limit: int = 250) -> Dict:
         start_time = time.monotonic(); status = "error"; errors = []; response_payload = None
         request_data = {"request_id": request.headers.get("X-Request-ID"), "trace_id": request.headers.get("X-Trace-ID")}
         try:
             await self.ensure_adapter()
             # TODO: Permission check
             neighbors = await self.adapter.get_neighbors(mizId, relationship_type, direction, limit)
             # Apply pseudonymization to neighborProps if needed
             # processed_neighbors = [...]
             status = "success"; response_payload = {"neighbors": neighbors}
         except InitializationError as ie: status = "service_unavailable"; errors.append({"code": "INIT_ERROR", "message": str(ie)})
         except ConnectionError as ce: status = "service_unavailable"; errors.append({"code": "DB_CONNECTION_ERROR", "message": str(ce)})
         except Exception as e: status = "internal_error"; errors.append({"code": "INTERNAL_SERVER_ERROR", "message": str(e)}); self.logger.error(f"API Error getting neighbors for {mizId}: {e}", exc_info=True)
         response = self._create_miz_oki_response(request_data, status, response_payload, errors if errors else None)
         response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
         return response

    # @app.post("/paths/find", response_model=MizOkiResponse)
    async def find_path_endpoint(self, request: Dict = Body(...)) -> Dict:
         start_time = time.monotonic(); status = "error"; errors = []; response_payload = None
         try:
             await self.ensure_adapter()
             # TODO: Permission check
             payload = request.get("payload", {})
             if not all(k in payload for k in ["start_node_hints", "end_node_hints"]):
                  raise ValueError("Invalid payload: 'start_node_hints' and 'end_node_hints' are required.")
             path_data = await self.adapter.find_path(
                 payload["start_node_hints"], payload["end_node_hints"],
                 payload.get("relationship_types"), payload.get("max_depth", 5)
             )
             if path_data is not None: # Could be empty list [] for path length 0
                  status = "success"; response_payload = {"path": path_data}
             else: status = "not_found"; errors.append({"code": "PATH_NOT_FOUND", "message": "No path found between the specified nodes."})
         except (ValueError, TypeError) as ve: status = "bad_request"; errors.append({"code": "VALIDATION_ERROR", "message": str(ve)})
         except InitializationError as ie: status = "service_unavailable"; errors.append({"code": "INIT_ERROR", "message": str(ie)})
         except ConnectionError as ce: status = "service_unavailable"; errors.append({"code": "DB_CONNECTION_ERROR", "message": str(ce)})
         except Exception as e: status = "internal_error"; errors.append({"code": "INTERNAL_SERVER_ERROR", "message": str(e)}); self.logger.error(f"API Error finding path: {e}", exc_info=True)
         response = self._create_miz_oki_response(request, status, response_payload, errors if errors else None)
         response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
         return response

    # @app.get("/schema", response_model=MizOkiResponse)
    async def get_schema_endpoint(self, request: Request) -> Dict:
         start_time = time.monotonic(); status = "error"; errors = []; response_payload = None
         request_data = {"request_id": request.headers.get("X-Request-ID"), "trace_id": request.headers.get("X-Trace-ID")}
         try:
             await self.ensure_adapter()
             # TODO: Permission check
             schema_data = await self.adapter.get_schema()
             status = "success"; response_payload = {"schema": schema_data}
         except InitializationError as ie: status = "service_unavailable"; errors.append({"code": "INIT_ERROR", "message": str(ie)})
         except ConnectionError as ce: status = "service_unavailable"; errors.append({"code": "DB_CONNECTION_ERROR", "message": str(ce)})
         except Exception as e: status = "internal_error"; errors.append({"code": "INTERNAL_SERVER_ERROR", "message": str(e)}); self.logger.error(f"API Error getting schema: {e}", exc_info=True)
         response = self._create_miz_oki_response(request_data, status, response_payload, errors if errors else None)
         response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
         return response

    # @app.get("/stats", response_model=MizOkiResponse)
    async def get_stats_endpoint(self, request: Request) -> Dict:
         start_time = time.monotonic(); status = "error"; errors = []; response_payload = None
         request_data = {"request_id": request.headers.get("X-Request-ID"), "trace_id": request.headers.get("X-Trace-ID")}
         try:
             await self.ensure_adapter()
             # TODO: Permission check
             stats_data = await self.adapter.get_stats()
             status = "success"; response_payload = {"stats": stats_data}
         except InitializationError as ie: status = "service_unavailable"; errors.append({"code": "INIT_ERROR", "message": str(ie)})
         except ConnectionError as ce: status = "service_unavailable"; errors.append({"code": "DB_CONNECTION_ERROR", "message": str(ce)})
         except Exception as e: status = "internal_error"; errors.append({"code": "INTERNAL_SERVER_ERROR", "message": str(e)}); self.logger.error(f"API Error getting stats: {e}", exc_info=True)
         response = self._create_miz_oki_response(request_data, status, response_payload, errors if errors else None)
         response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
         return response

    # @app.post("/vectors", response_model=MizOkiResponse, status_code=201)
    async def upsert_vector_endpoint(self, request: Dict = Body(...)) -> Dict:
         start_time = time.monotonic(); status = "error"; errors = []; response_payload = None
         try:
             await self.ensure_adapter()
             # TODO: Permission check
             payload = request.get("payload", {})
             # Basic validation (Pydantic model preferred)
             if not all(k in payload for k in ["vector_id", "vector", "metadata"]):
                  raise ValueError("Invalid payload: 'vector_id', 'vector', and 'metadata' are required.")
             success = await self.adapter.upsert_vector(
                 payload["vector_id"], payload["vector"], payload["metadata"], payload.get("namespace")
             )
             if success: status = "success"; response_payload = {"upserted": True, "vector_id": payload["vector_id"]}
             else: status = "failed"; errors.append({"code": "VECTOR_UPSERT_FAILED", "message": "Failed to upsert vector."})
         except (ValueError, TypeError) as ve: status = "bad_request"; errors.append({"code": "VALIDATION_ERROR", "message": str(ve)})
         except InitializationError as ie: status = "service_unavailable"; errors.append({"code": "INIT_ERROR", "message": str(ie)})
         except ConnectionError as ce: status = "service_unavailable"; errors.append({"code": "DB_CONNECTION_ERROR", "message": str(ce)})
         except Exception as e: status = "internal_error"; errors.append({"code": "INTERNAL_SERVER_ERROR", "message": str(e)}); self.logger.error(f"API Error upserting vector: {e}", exc_info=True)
         response = self._create_miz_oki_response(request, status, response_payload, errors if errors else None)
         response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
         return response

    # @app.post("/vectors/search", response_model=MizOkiResponse)
    async def search_vector_endpoint(self, request: Dict = Body(...)) -> Dict:
         start_time = time.monotonic(); status = "error"; errors = []; response_payload = None
         try:
             await self.ensure_adapter()
             # TODO: Permission check
             payload = request.get("payload", {})
             if not all(k in payload for k in ["query_vector", "k"]):
                  raise ValueError("Invalid payload: 'query_vector' and 'k' are required.")
             results = await self.adapter.search_vector_index(
                 payload["query_vector"], payload["k"],
                 payload.get("namespace"), payload.get("filter_dict"),
                 # payload.get("index_name") # Allow overriding index if needed
             )
             status = "success"; response_payload = {"results": results}
         except (ValueError, TypeError) as ve: status = "bad_request"; errors.append({"code": "VALIDATION_ERROR", "message": str(ve)})
         except InitializationError as ie: status = "service_unavailable"; errors.append({"code": "INIT_ERROR", "message": str(ie)})
         except ConnectionError as ce: status = "service_unavailable"; errors.append({"code": "DB_CONNECTION_ERROR", "message": str(ce)})
         except Exception as e: status = "internal_error"; errors.append({"code": "INTERNAL_SERVER_ERROR", "message": str(e)}); self.logger.error(f"API Error searching vectors: {e}", exc_info=True)
         response = self._create_miz_oki_response(request, status, response_payload, errors if errors else None)
         response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
         return response

    # @app.post("/xai/decisions", response_model=MizOkiResponse, status_code=201)
    async def save_decision_endpoint(self, request: Dict = Body(...)) -> Dict:
         start_time = time.monotonic(); status = "error"; errors = []; response_payload = None
         try:
             await self.ensure_adapter()
             # TODO: Permission check
             if not isinstance(request.get("payload"), dict) or "record" not in request["payload"]:
                  raise ValueError("Invalid payload: 'payload.record' (dict) is required.")
             record = request["payload"]["record"]
             success = await self.adapter.save_decision_record(record)
             if success: status = "success"; response_payload = {"decision_id": record.get("decision_id"), "saved": True}
             else: status = "failed"; errors.append({"code": "XAI_SAVE_FAILED", "message": "Failed to save decision record."})
         except (ValueError, TypeError) as ve: status = "bad_request"; errors.append({"code": "VALIDATION_ERROR", "message": str(ve)})
         except InitializationError as ie: status = "service_unavailable"; errors.append({"code": "INIT_ERROR", "message": str(ie)})
         except ConnectionError as ce: status = "service_unavailable"; errors.append({"code": "DB_CONNECTION_ERROR", "message": str(ce)})
         except Exception as e: status = "internal_error"; errors.append({"code": "INTERNAL_SERVER_ERROR", "message": str(e)}); self.logger.error(f"API Error saving decision record: {e}", exc_info=True)
         response = self._create_miz_oki_response(request, status, response_payload, errors if errors else None)
         response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
         return response

    # @app.get("/xai/decisions/{decision_id}", response_model=MizOkiResponse)
    async def retrieve_decision_endpoint(self, request: Request, decision_id: str) -> Dict:
         start_time = time.monotonic(); status = "error"; errors = []; response_payload = None
         request_data = {"request_id": request.headers.get("X-Request-ID"), "trace_id": request.headers.get("X-Trace-ID")}
         try:
             await self.ensure_adapter()
             # TODO: Permission check
             record = await self.adapter.retrieve_decision_record(decision_id)
             if record: status = "success"; response_payload = {"decision_record": record}
             else: status = "not_found"; errors.append({"code": "XAI_LOG_NOT_FOUND", "message": f"Decision record '{decision_id}' not found."})
         except InitializationError as ie: status = "service_unavailable"; errors.append({"code": "INIT_ERROR", "message": str(ie)})
         except ConnectionError as ce: status = "service_unavailable"; errors.append({"code": "DB_CONNECTION_ERROR", "message": str(ce)})
         except Exception as e: status = "internal_error"; errors.append({"code": "INTERNAL_SERVER_ERROR", "message": str(e)}); self.logger.error(f"API Error retrieving decision record {decision_id}: {e}", exc_info=True)
         response = self._create_miz_oki_response(request_data, status, response_payload, errors if errors else None)
         response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
         return response

    # @app.get("/health")
    async def health_check_endpoint(self, request: Request) -> Dict:
        """Basic health check endpoint."""
        # Simple check, can be expanded to check DB connection status etc.
        status = "ok" if self._adapter_initialized and self.adapter else "error"
        return {"status": status, "timestamp": datetime.now(datetime.timezone.utc).isoformat()}


# --- Initialization (Conceptual - Service deployed separately) ---
# This would happen in the main application entry point (e.g., main.py for FastAPI)

# kg_tool_service_instance: Optional[KnowledgeGraphToolService] = None

# async def startup_kg_service():
#     global kg_tool_service_instance
#     if _config_obj:
#         try:
#             kg_tool_service_instance = KnowledgeGraphToolService(_config_obj)
#             await kg_tool_service_instance.initialize_adapter() # Initialize on startup
#             logger.info("KnowledgeGraphToolService initialized successfully.")
#         except Exception as e:
#             logger.critical(f"Failed to initialize KnowledgeGraphToolService: {e}", exc_info=True)
#             # Optionally prevent app startup if KG is critical
#     else:
#         logger.critical("Cannot initialize KnowledgeGraphToolService: CONFIG_OBJ not loaded.")

# async def shutdown_kg_service():
#      if kg_tool_service_instance:
#          await kg_tool_service_instance.close_adapter()
#          logger.info("KnowledgeGraphToolService shut down.")

# Example FastAPI app setup (conceptual)
# if FASTAPI_AVAILABLE:
#     app = FastAPI(title="MIZ OKI 3.0 Knowledge Graph Tool Service")
#     app.add_event_handler("startup", startup_kg_service)
#     app.add_event_handler("shutdown", shutdown_kg_service)
#
#     # Dependency function for FastAPI endpoints
#     async def get_kg_service() -> KnowledgeGraphToolService:
#         if kg_tool_service_instance is None or not kg_tool_service_instance._adapter_initialized:
#              raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="KG Service not ready")
#         return kg_tool_service_instance
#
#     # Example endpoint definition using the service
#     @app.post("/entities", response_model=MizOkiResponse, status_code=status.HTTP_201_CREATED)
#     async def add_entity(request: MizOkiRequest, service: KnowledgeGraphToolService = Depends(get_kg_service)):
#         return await service.add_entity_endpoint(request.dict()) # Pass MIZ OKI dict
#
#     @app.get("/health")
#     async def health_check(service: KnowledgeGraphToolService = Depends(get_kg_service)):
#          return await service.health_check_endpoint(None) # Pass dummy request or modify endpoint signature

print("\n--- MIZ 3.0 KG Layer (Cell 3 - Reworked) ---")
print("Neo4jAsyncAdapter includes Vector DB routing & structured XAI storage.")
print("Conceptual FastAPI service layer added, handling MIZ OKI payloads.")
print("Requires full implementation of API endpoints and Vector DB client logic.")
print("Uses async operations for DB/API calls.")
print("-------------------------------------------------------------")


SyntaxError: invalid syntax (<ipython-input-4-361a23d22eb2>, line 1)

In [5]:
# Cell 4: Foundational Layer Implementation (Reworked)
# Status: AKA logic structured for ADK/Tool deployment. Uses real dependencies/proxies.
#         B.O.S.S/Experimentation loops trigger real clients (Vertex Workflows, Pub/Sub).
#         Placeholders remain for specific KG queries, external data fetch details, MoE API client.
#         State management (monitors, experiments) needs persistent backing in a real deployment.

import os
import numpy as np
import pandas as pd
import datetime
import json
import logging
import time
import random
import asyncio
from typing import Dict, Any, Optional, List, Union, Tuple, Callable
import uuid
from collections import deque, defaultdict, Counter # Added Counter
import aiohttp # For MoE Registry API call

# --- Assume Real Tool/Client Dependencies are Injected/Available ---
# These would be provided during the initialization of the service/agent running this logic.
try:
    # Use CONFIG_OBJ loaded in Cell 1
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ: raise NameError("CONFIG_OBJ not found or is None")
    if not isinstance(CONFIG_OBJ, EnhancedConfig): raise NameError("CONFIG_OBJ is not an EnhancedConfig instance")

    # Proxies for other MIZ OKI Tool APIs (representing deployed services)
    if 'kg_tool_service_instance' not in globals(): raise NameError("kg_tool_service_instance proxy not found") # Cell 3 Proxy
    if 'foundation_model_client' not in globals(): raise NameError("foundation_model_client proxy not found") # Cell 18 Proxy
    if 'kd_tool' not in globals(): raise NameError("kd_tool (KnowledgeDistillationTool instance/proxy) not found") # Cell 8 Proxy

    # Real/Mock Clients for GCP Services
    if '_workflow_executions_client' not in globals(): raise NameError("_workflow_executions_client not found") # Cell 16 Client
    if '_pubsub_client' not in globals(): raise NameError("_pubsub_client not found") # Cell 8 Client

    _config_obj = CONFIG_OBJ
    _kg_tool_proxy = kg_tool_service_instance
    _fm_client_proxy = foundation_model_client
    _kd_tool_proxy = kd_tool
    _workflow_client_proxy = _workflow_executions_client # Use real/mock client from Cell 16
    _pubsub_client_proxy = _pubsub_client # Use real/mock client from Cell 8
    _real_dependencies = True
    logger.debug("Using real/conceptual dependencies in Cell 4 (Reworked).")

except NameError as e:
    logger.warning(f"Dependency Error in Cell 4 ({e}). Using Mocks/Placeholders.")
    _real_dependencies = False
    # --- Mock/Placeholder Setup ---
    class MockKGTool:
        async def execute_query(self, request): await asyncio.sleep(0.01); return {"status": "success", "payload": {"results": [{'description': 'gap desc', 'domain': 'mock_domain'}]}}
        async def add_entities_bulk_endpoint(self, request): await asyncio.sleep(0.01); return {"status": "success", "payload": {"new": 1, "updated": 0, "failed": 0}}
    class MockFMClientTool:
        async def summarize(self, input_data): await asyncio.sleep(0.01); return {"status": "success", "payload": {"summary": "Mock summary."}}
        async def extract_entities(self, input_data): await asyncio.sleep(0.01); return {"status": "success", "payload": {"entities": [{"entity": "mock"}]}}
    class MockKnowledgeDistillationTool:
        async def distill_knowledge(self, request): await asyncio.sleep(0.1); return {"status": "success", "payload": {"teacher_output_path": "gs://mock/kd_output.jsonl"}}
    class MockVertexWorkflowClient:
        async def start_workflow(self, project, location, workflow_id, miz_oki_input): return f"projects/{project}/locations/{location}/workflows/{workflow_id}/executions/exec_{uuid.uuid4().hex[:8]}" # Return full name
        async def get_execution(self, request): return MagicMock(state=ExecutionState.SUCCEEDED) # Needs refinement if state logic tested
    class MockPubSubClient:
        async def publish(self, topic, data_bytes): return f"msg_{uuid.uuid4().hex[:8]}"

    # Define minimal config if needed
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ:
        from dataclasses import dataclass, field
        @dataclass class MockGcpConfig: project_id:Optional[str]="mock-proj"; region:str="mock-region"; gcs_bucket_name:Optional[str]="mock-bucket"
        @dataclass class MockAdkConfig: tool_registry_location: Optional[str] = 'gs://mock/tools'
        @dataclass class MockFmDefaults: llama4_maverick: str = "mock-llama"; llama4_scout: str = "mock-scout"
        @dataclass class MockFmConfig: defaults: MockFmDefaults = field(default_factory=MockFmDefaults)
        @dataclass class MockVertexAIConfig: experiment_execution_workflow_id: str = "mock-exp-exec"; experiment_analysis_workflow_id: str = "mock-exp-analysis"
        @dataclass class MockConfig: gcp: MockGcpConfig = field(default_factory=MockGcpConfig); mlops_trigger_topic:str="mock-topic"; mlops_rl_train_topic:str="mock-rl"; service_endpoints: ServiceEndpointsConfig = field(default_factory=ServiceEndpointsConfig); adk: MockAdkConfig = field(default_factory=MockAdkConfig); foundation_models: MockFmConfig = field(default_factory=MockFmConfig); vertex_ai: MockVertexAIConfig = field(default_factory=MockVertexAIConfig); miz_oki_schema_version: str = "3.0"; def get(self, key, default=None): parts=key.split('.'); val=self; try: [val := getattr(val, p) for p in parts]; return val; except: return default
        _config_obj = MockConfig()

    _kg_tool_proxy = MockKGTool()
    _fm_client_proxy = MockFMClientTool()
    _kd_tool_proxy = MockKnowledgeDistillationTool()
    _workflow_client_proxy = MockVertexWorkflowClient()
    _pubsub_client_proxy = MockPubSubClient()
    # --- End Mock/Placeholder Setup ---

logger = logging.getLogger('MIZ-OKI.FoundationalLayer')

# --- Mixture of Experts Registry Manager (Metadata Focus - Reworked Async) ---
class MixtureOfExpertsRegistryManager:
    """ Manages expert model metadata via API (preferred) or static source. """
    def __init__(self, config: EnhancedConfig):
        self.config = config
        self.registry_api_endpoint = config.service_endpoints.moe_registry_api_endpoint
        # Fallback to ADK tool registry location if API not set (less ideal for dynamic MoE)
        self.static_registry_location = config.adk.tool_registry_location if not self.registry_api_endpoint else None
        self.registry_source = self.registry_api_endpoint or self.static_registry_location
        self.expert_registry: Dict[str, Dict] = {} # In-memory cache
        self._cache_ttl_seconds = 300 # 5 minutes
        self._last_cache_update_time = 0
        self._lock = asyncio.Lock() # Prevent race conditions during refresh
        self.session: Optional[aiohttp.ClientSession] = None
        self.logger = logging.getLogger('MIZ-OKI.MoERegistryManager')
        if not self.registry_source:
            self.logger.warning("No MoE Registry API endpoint or static location configured! Expert routing will be disabled.")
        self.logger.info(f"MoE Registry Manager initialized. Source: {self.registry_source}")

    async def initialize(self):
        """Initialize async session."""
        self.session = aiohttp.ClientSession()
        await self._load_registry(force_refresh=True) # Initial load

    async def cleanup(self):
        """Close async session."""
        if self.session: await self.session.close()

    async def _load_registry(self, force_refresh: bool = False) -> None:
        """ Loads or refreshes the registry cache from the source (API preferred) asynchronously. """
        now = time.monotonic()
        async with self._lock:
            if not force_refresh and (now - self._last_cache_update_time < self._cache_ttl_seconds):
                return # Cache is still valid

            self.logger.info(f"Refreshing MoE registry cache from {self.registry_source}...")
            new_registry = {}
            try:
                if self.registry_api_endpoint:
                    if not self.session: self.session = aiohttp.ClientSession() # Ensure session exists
                    api_url = f"{self.registry_api_endpoint.rstrip('/')}/experts" # Assuming a '/experts' endpoint
                    # TODO: Add authentication headers if the API requires it (e.g., API key, OIDC token)
                    headers = {"Accept": "application/json"}
                    async with self.session.get(api_url, headers=headers, timeout=10) as response:
                        response.raise_for_status() # Raise exception for bad status codes
                        experts_list = await response.json()
                        if isinstance(experts_list, list):
                            new_registry = {expert.get('expert_id'): expert for expert in experts_list if expert.get('expert_id')}
                        else:
                            self.logger.error(f"Invalid format from MoE Registry API: Expected list, got {type(experts_list)}")
                elif self.static_registry_location and self.static_registry_location.startswith("gs://"):
                    # --- Placeholder: Implement GCS load using aio-gcsfs or sync thread ---
                    # Example using sync thread:
                    # def _load_from_gcs_sync():
                    #     client = storage.Client(project=self.config.gcp.project_id)
                    #     bucket_name = self.static_registry_location.split('/')[2]
                    #     blob_name = '/'.join(self.static_registry_location.split('/')[3:])
                    #     blob = client.bucket(bucket_name).blob(blob_name)
                    #     content = blob.download_as_text()
                    #     return json.loads(content)
                    # registry_data = await asyncio.to_thread(_load_from_gcs_sync)
                    # if isinstance(registry_data, dict): new_registry = registry_data # Assuming GCS file stores the dict directly
                    self.logger.warning(f"MoE Registry loading from GCS ({self.static_registry_location}) not implemented - using empty cache.") # Placeholder
                else:
                    self.logger.warning(f"Invalid or missing MoE registry source: {self.registry_source}")

                self.expert_registry = new_registry
                self._last_cache_update_time = time.monotonic()
                self.logger.info(f"MoE registry cache refreshed. Found {len(self.expert_registry)} experts.")
            except aiohttp.ClientError as http_err:
                self.logger.error(f"HTTP error fetching MoE registry from API: {http_err}")
                self.expert_registry = {} # Clear cache on error
            except json.JSONDecodeError as json_err:
                 self.logger.error(f"Failed to parse JSON from MoE registry source: {json_err}")
                 self.expert_registry = {}
            except Exception as e:
                self.logger.error(f"Failed to refresh MoE registry cache: {e}", exc_info=True)
                self.expert_registry = {} # Clear cache on error

    async def get_expert_details(self, expert_id: str) -> Optional[Dict]:
        """Gets details for a specific expert from the cache."""
        await self._load_registry() # Ensure cache is reasonably fresh
        details = self.expert_registry.get(expert_id)
        if not details:
            self.logger.warning(f"Expert '{expert_id}' not found in registry cache.")
            # Optionally trigger a forced refresh if not found
            # await self._load_registry(force_refresh=True)
            # details = self.expert_registry.get(expert_id)
        return details

    async def find_expert_for_task(self, task_type: str, domain: Optional[str] = None, context: Optional[Dict] = None) -> Optional[str]:
        """Finds the best expert ID for a given task, domain, and context."""
        await self._load_registry() # Ensure cache is reasonably fresh
        if not self.expert_registry:
            self.logger.warning("MoE registry is empty. Cannot find expert.")
            return None

        candidates = []
        for expert_id, details in self.expert_registry.items():
            # Basic filtering (active, task type match)
            if details.get('status') != 'active': continue
            if details.get('task_type') != task_type: continue

            # Domain scoring (exact match = 1.0, partial/no match = lower score)
            domain_score = 1.0 if not domain or details.get('domain') == domain else 0.5 # Simple scoring

            # Performance scoring (use a primary metric like accuracy or F1)
            # Default to 0.5 if metrics are missing
            perf_score = details.get('evaluation_metrics', {}).get('accuracy', 0.5)

            # Contextual scoring (Placeholder - could involve checking tags, input/output types, etc.)
            context_score = 1.0 # Default if no context matching implemented

            # Combine scores (example: weighted average)
            final_score = (0.4 * domain_score) + (0.4 * perf_score) + (0.2 * context_score)
            candidates.append((expert_id, final_score))

        if not candidates:
            self.logger.warning(f"No suitable 'active' expert found for task '{task_type}' / domain '{domain}'.")
            return None

        # Select the best candidate (highest score)
        candidates.sort(key=lambda x: x[1], reverse=True)
        best_expert_id = candidates[0][0]
        self.logger.info(f"Routed task '{task_type}'/{domain or '*'} to expert '{best_expert_id}' (Score: {candidates[0][1]:.2f}).")
        return best_expert_id

# --- Conceptual ADK Agent Base ---
class AdkAgentBase: # Placeholder Base Class for structure
    """Conceptual base for agents intended to run within ADK/Vertex Agent Engine."""
    def __init__(self, agent_id, config, tools=None):
        self.agent_id = agent_id
        self.config = config
        # Tools would be injected proxies to other services/APIs
        self.tools = tools or {}
        # State management needs integration with the chosen persistence layer (e.g., Firestore, Workflow state)
        self.state: Dict[str, Any] = {}
        self.logger = logging.getLogger(f'MIZ-OKI.AdkAgent.{agent_id}')
        self.logger.info(f"ADK Agent '{agent_id}' logic initialized.")

    async def setup(self, initial_state: Optional[Dict] = None):
        """Load initial state. In real ADK, this might be handled by the framework."""
        self.state = initial_state or {}
        # TODO: Implement loading state from persistent store (e.g., Firestore, DB) if needed
        self.logger.info(f"Agent {self.agent_id} setup complete.")

    async def invoke(self, input_data: Dict, context: Dict) -> Dict:
        """Core agent logic. Expects MIZ OKI input, returns MIZ OKI output."""
        raise NotImplementedError # Subclasses must implement this

    async def save_state(self):
        """Persist agent state. In real ADK, this might be handled by the framework."""
        # TODO: Implement saving state to persistent store
        logger.warning(f"Agent {self.agent_id}: save_state not implemented - state is ephemeral.")
        pass

# --- Autonomous Knowledge Agent (Reworked Logic for ADK/Tool) ---
class AutonomousKnowledgeAgentLogic(AdkAgentBase):
    """ Implements AKA logic. Runs B.O.S.S. loop & experimentation asynchronously. Designed as a deployable Tool/Service. """
    def __init__(self, agent_id: str, config: EnhancedConfig, kg_tool_proxy: Any, fm_client_proxy: Any, kd_tool_proxy: Any,
                 moe_registry: MixtureOfExpertsRegistryManager, workflow_client_proxy: Any, pubsub_client_proxy: Any):
        # Inject proxies/clients for dependencies
        tools = {"kg": kg_tool_proxy, "fm": fm_client_proxy, "kd": kd_tool_proxy}
        super().__init__(agent_id, config, tools=tools)
        self.moe_registry = moe_registry
        self.workflow_client = workflow_client_proxy # Use the injected client proxy
        self.pubsub_client = pubsub_client_proxy     # Use the injected client proxy
        self.project = config.gcp.project_id
        self.location = config.gcp.region
        self.mlops_trigger_topic_name = config.mlops_trigger_topic
        self.mlops_rl_train_topic_name = config.mlops_rl_train_topic
        # State attributes - NEED PERSISTENCE
        self.discovery_monitors: Dict[str, Dict] = {}
        self.experiments: Dict[str, Dict] = {}
        # In-memory log, consider persistent logging via Cloud Logging
        self.agent_history = deque(maxlen=500)

    async def setup(self, initial_state: Optional[Dict] = None):
        """ Load monitors, experiments state from persistent store or initial config. """
        await super().setup(initial_state)
        # --- TODO: Implement loading state from persistent store ---
        # Example: Load from Firestore or state passed via workflow input
        self.discovery_monitors = self.state.get("discovery_monitors", {})
        self.experiments = self.state.get("experiments", {})
        # --- End TODO ---
        # Add a default monitor if none exist (for demonstration)
        if not self.discovery_monitors:
            self.add_discovery_monitor("default_news_monitor", "api", "market_news_api", 6, ['summarize', 'extract_entities'])
        self.logger.info("AutonomousKnowledgeAgent setup complete.")

    def add_discovery_monitor(self, monitor_id, source_type, query_or_key, frequency_hours, processing_pipeline):
        """ Adds/Updates a discovery monitor config. State needs persistence. """
        self.discovery_monitors[monitor_id] = {
            "source_type": source_type,
            "query_or_key": query_or_key,
            "frequency_hours": frequency_hours,
            "processing_pipeline": processing_pipeline,
            "last_checked": None,
            "status": "active"
        }
        self.state["discovery_monitors"] = self.discovery_monitors # Update conceptual state
        # TODO: Call self.save_state() or rely on ADK framework persistence
        self.logger.info(f"Added/Updated discovery monitor: {monitor_id}")

    async def invoke(self, input_data: Dict, context: Optional[Dict]=None) -> Dict:
        """
        Handles tasks based on MIZ OKI input payload. Returns MIZ OKI response.
        This is the main entry point when called as a Tool/Service.
        """
        # --- MIZ OKI Payload Parsing ---
        miz_oki_version = input_data.get("miz_oki_version", "unknown")
        request_id = input_data.get("request_id", f"req_aka_{uuid.uuid4().hex[:8]}")
        trace_id = input_data.get("trace_id", f"trace_aka_{uuid.uuid4().hex[:8]}")
        workflow_execution_id = input_data.get("workflow_execution_id") # Passed by Vertex Workflow
        step_id = input_data.get("step_id") # Passed by Vertex Workflow
        source_component = input_data.get("source_component")
        payload = input_data.get("payload", {})
        task_type = payload.get("task_type", "default_cycle") # e.g., "run_discovery_cycle", "run_boss_cycle"

        self.logger.info(f"AKA invoke started. Task: '{task_type}', Trace: {trace_id}, Request: {request_id}")

        # --- Prepare MIZ OKI Response Structure ---
        response = {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_id, "trace_id": trace_id,
            "workflow_execution_id": workflow_execution_id, "step_id": step_id,
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": self.agent_id, # This agent's ID
            "target_component": source_component, # Respond to caller
            "status": "unknown", # Will be updated
            "payload": {"results": None},
            "error_details": None,
            "metadata": {}
        }
        start_time = time.monotonic()
        results = None; errors = []

        try:
            # --- Dependency Check ---
            if not all([self.tools.get("kg"), self.tools.get("fm"), self.tools.get("kd"),
                        self.moe_registry, self.workflow_client, self.pubsub_client]):
                raise InitializationError("AKA missing critical dependencies (tools or clients).")

            # --- Task Routing ---
            if task_type == "run_discovery_cycle":
                results = await self.run_discovery_cycle(trace_id)
            elif task_type == "run_boss_cycle":
                results = await self.run_boss_cycle(trace_id)
            elif task_type == "run_experimentation_cycle":
                results = await self.run_experimentation_cycle(trace_id)
            elif task_type == "default_cycle":
                 # Example: Run discovery and B.O.S.S. sequentially
                 discovery_results = await self.run_discovery_cycle(trace_id)
                 boss_results = await self.run_boss_cycle(trace_id)
                 # Experimentation might be triggered separately or based on B.O.S.S. outcomes
                 results = {"discovery_summary": discovery_results, "boss_summary": boss_results}
            else:
                self.logger.warning(f"Unsupported task type for AKA: {task_type}")
                response["status"] = "bad_request"
                errors.append({"code": "UNSUPPORTED_TASK", "message": f"Task type '{task_type}' not supported by {self.agent_id}."})

            # --- Update Response Status ---
            if response["status"] == "unknown": # If no error set yet
                 response["status"] = "success"
                 response["payload"]["results"] = results

        except InitializationError as init_e:
             self.logger.critical(f"AKA Initialization Error during invoke: {init_e}")
             response["status"] = "service_unavailable"; errors.append({"code": "INIT_ERROR", "message": str(init_e)})
        except Exception as e:
            self.logger.error(f"Error during AKA invoke for task '{task_type}' (Trace: {trace_id}): {e}", exc_info=True)
            response["status"] = "internal_error"; errors.append({"code": "INTERNAL_ERROR", "message": str(e)})

        # --- Finalize Response ---
        # TODO: Call self.save_state() or rely on ADK framework persistence
        if errors: response["error_details"] = errors
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        self.logger.info(f"AKA invoke finished. Status: {response['status']}. Duration: {response['metadata']['processing_duration_ms']:.2f} ms")
        return response

    # --- Research & Discovery (Reworked Async) ---
    async def run_discovery_cycle(self, trace_id: str) -> Dict:
        """ Runs active discovery monitors asynchronously. """
        self.logger.info(f"Starting external discovery cycle (Trace: {trace_id})...")
        now_dt = datetime.datetime.now(datetime.timezone.utc); now_iso = now_dt.isoformat()
        triggered_monitors = []; tasks = []
        # --- TODO: Load self.discovery_monitors from persistent state ---
        for monitor_id, monitor in self.discovery_monitors.items():
            if monitor.get("status") != "active": continue
            frequency = monitor.get("frequency_hours", 6); last_checked_iso = monitor.get("last_checked"); should_run = True
            if last_checked_iso:
                try: last_checked_dt = datetime.datetime.fromisoformat(last_checked_iso.replace('Z', '+00:00')) # Handle Z timezone
                except ValueError: last_checked_dt = None
                if last_checked_dt and (now_dt - last_checked_dt < datetime.timedelta(hours=frequency)): should_run = False
            if should_run:
                tasks.append(self._run_single_monitor(monitor_id, monitor, now_iso, trace_id))
                triggered_monitors.append(monitor_id)

        run_summary = {"triggered_count": len(triggered_monitors), "triggered_ids": triggered_monitors, "results": []}
        if tasks:
             results_or_exceptions = await asyncio.gather(*tasks, return_exceptions=True)
             processed_results = []
             for i, res in enumerate(results_or_exceptions):
                 monitor_id = triggered_monitors[i]
                 if isinstance(res, Exception):
                     logger.error(f"Monitor {monitor_id} failed: {res}")
                     processed_results.append({"monitor_id": monitor_id, "status": "error", "error": str(res)})
                 elif isinstance(res, dict):
                     processed_results.append(res)
                     # Update last_checked only on success
                     if res.get("status") == "success":
                         self.discovery_monitors[monitor_id]["last_checked"] = now_iso
                 else: # Should not happen if _run_single_monitor returns dict
                      processed_results.append({"monitor_id": monitor_id, "status": "error", "error": "Unknown result type"})
             run_summary["results"] = processed_results
        # --- TODO: Persist self.discovery_monitors state ---
        self.logger.info(f"External discovery cycle finished (Trace: {trace_id}). Triggered {len(triggered_monitors)} monitors.")
        return run_summary

    async def _run_single_monitor(self, monitor_id: str, monitor: Dict, run_time_iso: str, trace_id: str) -> Dict:
        """ Executes a single monitor asynchronously using Tool API proxies. """
        run_log = {"monitor_id": monitor_id, "trace_id": trace_id, "timestamp": run_time_iso, "status": "started", "steps": {}}
        kg_tool = self.tools.get("kg")
        fm_tool = self.tools.get("fm")
        if not kg_tool or not fm_tool:
            return {**run_log, "status": "error", "error": "KG Tool or FM Client Tool proxy unavailable"}

        try:
            # 1. Fetch External Data (Placeholder - Needs dedicated Tool)
            step_start = time.monotonic()
            # --- TODO: Replace placeholder with API call to ExternalDataFetcherTool ---
            # fetch_request = {"payload": {"source_type": monitor["source_type"], "query_or_key": monitor["query_or_key"]}, "trace_id": trace_id}
            # fetch_response = await external_data_fetcher_proxy(request=fetch_request)
            # raw_findings = fetch_response.get("payload", {}).get("findings", []) if fetch_response.get("status") == "success" else []
            raw_findings = await self._fetch_external_data(monitor["source_type"], monitor["query_or_key"]) # Using placeholder
            # --- End TODO ---
            run_log["steps"]["fetch_data"] = {"duration_ms": (time.monotonic() - step_start) * 1000, "findings_count": len(raw_findings)}
            if not raw_findings: run_log["status"] = "no_new_findings"; return run_log

            # 2. Process Findings (via FM Client Tool API Proxy)
            step_start = time.monotonic()
            processed_insights = await self._process_findings(raw_findings, monitor.get("processing_pipeline", []), fm_tool, trace_id)
            run_log["steps"]["process_findings"] = {"duration_ms": (time.monotonic() - step_start) * 1000, "processed_count": len(processed_insights)}
            if not processed_insights: run_log["status"] = "no_insights_processed"; return run_log

            # 3. Integrate Insights (via KG Tool API Proxy)
            step_start = time.monotonic()
            integration_results = await self._integrate_insights(processed_insights, f"discovery:{monitor_id}", kg_tool, trace_id)
            run_log["steps"]["integrate_insights"] = {"duration_ms": (time.monotonic() - step_start) * 1000, **integration_results}
            run_log["status"] = "success" if integration_results.get("integrated_count", 0) > 0 else "integration_failed"

        except Exception as e:
            logger.error(f"Error running monitor {monitor_id} async: {e}", exc_info=True)
            run_log["status"] = "error"; run_log["error"] = str(e)

        self.agent_history.append(run_log) # Append to in-memory history
        return run_log

    async def _fetch_external_data(self, source_type: str, query_or_key: str) -> List[Dict]:
        """ Placeholder: Should call a dedicated External Data Fetcher Tool/Service via API. """
        self.logger.debug(f"Fetching external data async: Type={source_type} (Placeholder - Needs dedicated tool call)")
        # --- Placeholder Logic ---
        await asyncio.sleep(random.uniform(0.05, 0.2)) # Simulate API call latency
        if source_type == 'api':
            return [{"id": f"api_{uuid.uuid4().hex[:6]}", "title": f"Simulated API Result for {query_or_key}", "content": "Content from API...", "source": query_or_key, "publishedAt": datetime.now().isoformat()}]
        if source_type == 'web_search':
            return [{"id": f"web_{uuid.uuid4().hex[:6]}", "title": f"Simulated Web Result for {query_or_key}", "snippet": "Snippet from web search...", "url": f"http://example.com/{uuid.uuid4().hex[:6]}", "source": "web_search"}]
        return []

    async def _process_findings(self, findings: List[Dict], pipeline: List[str], fm_tool: Any, trace_id: str) -> List[Dict]:
        """ Process raw findings using FM Client Tool API via proxy. """
        if not fm_tool: logger.error("FM Client Tool proxy unavailable for processing findings."); return []
        model_alias = self.config.foundation_models.defaults.llama4_scout # Use scout for processing

        async def process_single(finding: Dict) -> Optional[Dict]:
            text = finding.get('content') or finding.get('snippet') or finding.get('title') or ''
            finding_id = finding.get('id') or finding.get('url') or f"f_{uuid.uuid4().hex[:6]}"
            processed = {"original_id": finding_id, **finding, "processing_error": None}
            if not text: return processed # Return original if no text to process

            try:
                for step in pipeline:
                    fm_request = {
                        "payload": {"model_alias": model_alias},
                        "trace_id": trace_id, "request_id": f"fm_{step}_{finding_id}"
                    }
                    if step == 'summarize':
                        fm_request["payload"]["prompt"] = f"Summarize the following content:\n{text}"
                        fm_request["payload"]["max_tokens"] = 150 # Example length
                        fm_response = await fm_tool.generate_text(input_data=fm_request) # API Call via proxy
                        if fm_response.get("status") == "success": processed['summary_ai'] = fm_response.get("payload", {}).get("generated_text")
                        else: raise RuntimeError(f"Summarize API call failed: {fm_response.get('error_details')}")
                    elif step == 'extract_entities':
                        fm_request["payload"]["prompt"] = f"Extract key entities (people, orgs, locations, topics) from:\n{text}\nOutput as JSON list: [{{'entity': '...', 'type': '...'}}]"
                        fm_response = await fm_tool.generate_text(input_data=fm_request) # API Call via proxy
                        if fm_response.get("status") == "success":
                            try: processed['entities_ai'] = json.loads(fm_response.get("payload", {}).get("generated_text", "[]"))
                            except json.JSONDecodeError: processed['entities_ai'] = [{"error": "Invalid JSON from LLM"}]; logger.warning("Failed to parse JSON entities from LLM.")
                        else: raise RuntimeError(f"Extract Entities API call failed: {fm_response.get('error_details')}")
                    # Add other processing steps (e.g., sentiment analysis via fm_tool.analyze)
                return processed
            except Exception as e:
                logger.error(f"Error processing finding async '{finding_id}': {e}")
                processed['processing_error'] = str(e)
                return processed

        tasks = [process_single(f) for f in findings if isinstance(f, dict)]
        results = await asyncio.gather(*tasks)
        return [r for r in results if r is not None]

    async def _integrate_insights(self, insights: List[Dict], source_prefix: str, kg_tool: Any, trace_id: str) -> Dict:
        """ Integrates insights into KG via KG Tool API proxy. """
        if not kg_tool or not hasattr(kg_tool, 'add_entities_bulk_endpoint'):
            logger.error("KG Tool API proxy unavailable or missing 'add_entities_bulk_endpoint' method.")
            return {"integrated_count": 0, "failed_count": len(insights)}

        kg_entities = []
        for insight in insights:
             # Create a unique ID for the insight node itself
             insight_node_id = insight.get('original_id') or f"{source_prefix}:{uuid.uuid4().hex[:8]}"
             hints = {"type": "ExternalInsight", "source": source_prefix, "original_id": insight_node_id}
             # Prepare properties for the insight node
             entity_dict = {
                 "_resolution_hints": hints,
                 "mizId": insight_node_id, # Use the generated ID as mizId
                 "type": "ExternalInsight",
                 "title": insight.get("title"),
                 "link": insight.get("url"),
                 "source": source_prefix,
                 "published_at": _parse_date(insight.get("publishedAt")), # Use helper
                 "summary_ai": insight.get("summary_ai"),
                 # Store extracted entities as a JSON string or list property
                 "entities_ai_json": json.dumps(insight.get("entities_ai", []), default=str),
                 "processing_error": insight.get("processing_error"),
                 "processed_at": datetime.now(datetime.timezone.utc).isoformat()
             }
             kg_entities.append(entity_dict)
             # TODO: Optionally create nodes for extracted entities_ai and link them to the insight node

        if not kg_entities:
            return {"integrated_count": 0, "failed_count": 0}

        self.logger.info(f"Integrating {len(kg_entities)} insights into KG via Tool API from {source_prefix}...")
        try:
            # Call KG Tool API endpoint via proxy (MIZ OKI payload)
            kg_request = {
                "payload": {"entities": kg_entities, "source": source_prefix},
                "trace_id": trace_id, "request_id": f"kg_integrate_{source_prefix}"
            }
            kg_response = await kg_tool.add_entities_bulk_endpoint(request=kg_request) # Call proxy method

            # Parse MIZ OKI response from KG Tool
            if kg_response.get("status") in ["success", "partial_success"]:
                kg_payload = kg_response.get("payload", {})
                integrated_count = kg_payload.get("new", 0) + kg_payload.get("updated", 0)
                failed_count = kg_payload.get("failed", 0)
                if failed_count > 0: logger.warning(f"KG integration via Tool API: {failed_count} failures reported by KG Tool.")
                self.logger.info(f"Insight integration via Tool API Complete. Successful: {integrated_count}, Failed: {failed_count}")
                return {"integrated_count": integrated_count, "failed_count": failed_count}
            else:
                raise RuntimeError(f"KG Tool Bulk API call failed: {kg_response.get('error_details')}")
        except Exception as e:
            logger.error(f"Failed to call KG Tool Bulk API via proxy: {e}", exc_info=True)
            return {"integrated_count": 0, "failed_count": len(kg_entities)}

    # --- B.O.S.S. Self-Teaching Loop (Reworked Async) ---
    async def run_boss_cycle(self, trace_id: str) -> Dict:
        """ Executes one cycle of the B.O.S.S. loop asynchronously. """
        self.logger.info(f"Starting B.O.S.S. cycle (Trace: {trace_id})...")
        cycle_log = {"cycle_id": trace_id, "timestamp_start": datetime.now(datetime.timezone.utc).isoformat(), "status": "started", "steps": {}}
        kg_tool = self.tools.get("kg")
        kd_tool = self.tools.get("kd")
        if not kg_tool or not kd_tool or not self.pubsub_client:
            error_msg = "Missing dependencies (KG Tool, KD Tool, or PubSub Client proxies)."
            logger.error(f"B.O.S.S. (Trace: {trace_id}): {error_msg}")
            return {**cycle_log, "status": "error", "error": error_msg}

        try:
            # 1. Identify Gaps (Call KG Tool API Proxy)
            start_step = time.monotonic()
            gaps = await self._identify_knowledge_gaps(kg_tool, trace_id)
            cycle_log["steps"]["identify_gaps"] = {"duration_ms": (time.monotonic()-start_step)*1000, "status": "success", "gaps_found": len(gaps)}
            if not gaps:
                self.logger.info(f"B.O.S.S. (Trace: {trace_id}): No significant knowledge gaps identified."); cycle_log["status"] = "no_gaps"; return cycle_log

            # 2. Prioritize Gap (Simple: take the first one)
            # TODO: Implement more sophisticated prioritization logic
            selected_gap = gaps[0]
            cycle_log["selected_gap"] = selected_gap
            self.logger.info(f"B.O.S.S. (Trace: {trace_id}): Selected gap: {selected_gap.get('description')}")

            # 3. Research Gap (Call External Fetcher Tool -> FM Client Tool API Proxy)
            start_step = time.monotonic()
            research_findings = await self._trigger_subagent_research(selected_gap.get('description'), trace_id)
            cycle_log["steps"]["research"] = {"duration_ms": (time.monotonic()-start_step)*1000, "status": "success", "findings_count": len(research_findings)}
            if not research_findings:
                self.logger.warning(f"B.O.S.S. (Trace: {trace_id}): Research yielded no findings for gap '{selected_gap.get('description')}'."); cycle_log["status"] = "research_failed"; return cycle_log

            # 4. Synthesize & Trigger Training (Call KD Tool API Proxy -> Pub/Sub Client Proxy)
            start_step = time.monotonic()
            mini_model_info = await self._generate_and_trigger_mini_model_training(selected_gap, research_findings, kd_tool, trace_id)
            cycle_log["steps"]["synthesize_trigger"] = {"duration_ms": (time.monotonic()-start_step)*1000}
            if not mini_model_info or mini_model_info.get("status") != "training_triggered":
                 self.logger.error(f"B.O.S.S. (Trace: {trace_id}): Mini-model generation/trigger failed."); cycle_log["status"] = "synthesis_failed"; cycle_log["steps"]["synthesize_trigger"]["status"] = "failed"; return cycle_log
            cycle_log["steps"]["synthesize_trigger"].update(mini_model_info)
            self.logger.info(f"B.O.S.S. (Trace: {trace_id}): Triggered mini-model training via Pub/Sub ({mini_model_info.get('pipeline_trigger_message_id')}).")
            cycle_log["status"] = "training_triggered"

        except Exception as e:
            logger.error(f"Error during B.O.S.S. cycle (Trace: {trace_id}): {e}", exc_info=True)
            cycle_log["status"] = "error"; cycle_log["error"] = str(e)

        self.agent_history.append(cycle_log) # Append to in-memory history
        return cycle_log

    async def _identify_knowledge_gaps(self, kg_tool: Any, trace_id: str) -> List[Dict]:
        """ Identifies knowledge gaps by querying the KG Tool API proxy. """
        self.logger.debug(f"B.O.S.S.: Identifying knowledge gaps via KG Tool API...")
        # Conceptual Query (adjust based on actual KG schema for tasks/models/performance)
        # This query looks for TaskDefinition nodes that either have no associated Model
        # or where the average performance score of associated Models is below a threshold (e.g., 0.6).
        query = """
        MATCH (t:TaskDefinition)
        OPTIONAL MATCH (t)<-[:PERFORMS_TASK]-(m:Model) // Assuming Model nodes have performance_score
        WITH t, count(m) AS modelCount, avg(m.performance_score) AS avgModelPerf
        WHERE modelCount = 0 OR avgModelPerf < $performance_threshold
        RETURN t.description AS description,
               t.domain AS domain,
               t.required_task_type AS potential_task_type,
               // Calculate a gap score (higher for missing models or lower performance)
               (CASE WHEN modelCount = 0 THEN 1.0 ELSE ($performance_threshold - avgModelPerf) / $performance_threshold END) AS gap_score
        ORDER BY gap_score DESC
        LIMIT 5 // Limit the number of gaps returned
        """
        params = {"performance_threshold": 0.6} # Example threshold
        try:
             # Call KG Tool API proxy (MIZ OKI payload)
             kg_request = {
                 "payload": {"query": query, "parameters": params},
                 "trace_id": trace_id, "request_id": f"kg_find_gaps_{trace_id}"
             }
             kg_response = await kg_tool.execute_query(request=kg_request) # Call proxy method

             if kg_response.get("status") == "success":
                 gaps = kg_response.get("payload", {}).get("results", [])
                 self.logger.info(f"B.O.S.S.: Found {len(gaps)} potential knowledge gaps.")
                 return gaps
             else:
                 logger.error(f"KG Tool API query for gaps failed: {kg_response.get('error_details')}")
                 return []
        except Exception as e:
            logger.error(f"Failed to query KG Tool API for gaps: {e}", exc_info=True)
            return []

    async def _trigger_subagent_research(self, gap_description: str, trace_id: str) -> List[Dict]:
        """ Triggers research via dedicated tools/agents (placeholder) and processes results. """
        self.logger.debug(f"B.O.S.S. (Trace: {trace_id}): Triggering research for gap: {gap_description}")
        # --- TODO: Replace placeholder with API call to a dedicated ResearchAgent/Tool ---
        # research_request = {"payload": {"query": f"Find information and techniques related to: {gap_description}"}, "trace_id": trace_id}
        # research_response = await research_agent_proxy(request=research_request)
        # findings = research_response.get("payload", {}).get("findings", []) if research_response.get("status") == "success" else []
        findings = await self._fetch_external_data(source_type='web_search', query_or_key=f"techniques for {gap_description}") # Using placeholder
        # --- End TODO ---
        if not findings: return []
        # Process findings using FM Client Tool API Proxy
        return await self._process_findings(findings, ['summarize', 'extract_entities'], self.tools.get("fm"), trace_id)

    async def _generate_and_trigger_mini_model_training(self, gap_details: Dict, research_findings: List[Dict], kd_tool: Any, trace_id: str) -> Optional[Dict]:
        """ Generates teacher outputs via KD Tool API proxy and triggers MLOps via Pub/Sub client proxy. """
        task_desc = gap_details.get('description', 'Unknown_Task'); task_type = gap_details.get('potential_task_type', 'classification'); domain = gap_details.get('domain', 'boss_gen')
        self.logger.info(f"B.O.S.S. (Trace: {trace_id}): Synthesizing mini-model for: {task_desc}")

        # Prepare input data for the teacher model (KD Tool)
        # Use summaries or relevant parts of research findings
        kd_input_items = [f.get('summary_ai', '') or f.get('content', '') or f.get('snippet', '') for f in research_findings]
        kd_input_items = [item for item in kd_input_items if item] # Filter out empty inputs
        if not kd_input_items:
            logger.warning("B.O.S.S.: No suitable content from research findings for KD teacher input.")
            return {"status": "failed", "error": "No input data for teacher model"}

        # --- TODO: Save full input dataset to GCS asynchronously ---
        # This requires a robust async GCS write implementation (e.g., using aio-gcsfs)
        # For now, we'll use a placeholder path and pass preview data to KD tool.
        temp_kd_input_uri = f"gs://{self.config.gcp.gcs_bucket_name}/kd_inputs/boss_{domain}_{task_type}_{uuid.uuid4().hex[:8]}.jsonl"
        logger.warning(f"B.O.S.S.: Placeholder - Full KD input data should be saved to {temp_kd_input_uri}")
        # --- End TODO ---

        student_model_name = f"boss_mini_{domain}_{task_type}_{uuid.uuid4().hex[:6]}"
        student_model_details = {"name": student_model_name, "architecture": self.config.get('learning_flows.kd.default_student_architecture', "distilbert-base-uncased"), "task_type": task_type}
        # Reference to the dataset (using placeholder URI and preview)
        dataset_ref_for_kd = {"gcs_uri": temp_kd_input_uri, "inputs_preview": kd_input_items[:10]} # Pass preview
        distillation_params = {"teacher_model_alias": self.config.get('learning_flows.kd.teacher_model_alias', self.config.foundation_models.defaults.llama4_maverick)}
        teacher_output_uri = None

        try: # Call KD Tool API via proxy
            kd_request = {
                "payload": {
                    "student_model_details": student_model_details,
                    "dataset_ref": dataset_ref_for_kd,
                    "distillation_params": distillation_params
                },
                "trace_id": trace_id, "request_id": f"kd_teacher_{student_model_name}"
            }
            kd_response = await kd_tool.distill_knowledge(request=kd_request) # Call proxy method

            if kd_response.get("status") == "success":
                teacher_output_uri = kd_response.get("payload", {}).get("teacher_output_path")
                if not teacher_output_uri: raise RuntimeError("KD Tool API succeeded but did not return teacher output path.")
                self.logger.info(f"B.O.S.S.: Teacher outputs generated by KD Tool API: {teacher_output_uri}")
            else:
                raise RuntimeError(f"KD Tool API failed: {kd_response.get('error_details')}")
        except Exception as kd_e:
            logger.error(f"B.O.S.S.: Error calling KD Tool API proxy: {kd_e}", exc_info=True)
            return {"status": "failed", "error": f"KD Tool API Error: {kd_e}"}

        # Trigger MLOps Pipeline via Pub/Sub Client Proxy
        pipeline_name = "miz3_expert_training_pipeline_v1deploy_apireg" # From Cell 17 (or config)
        # Parameters for the Vertex AI Pipeline job
        pipeline_params = {
            "project": self.project,
            "location": self.location,
            "model_display_name_prefix": student_model_name,
            "source_uri_or_query": temp_kd_input_uri, # Input data for student training
            "target_column": "teacher_prediction", # Column containing teacher outputs
            "task_type": task_type,
            "expert_domain": domain,
            "epochs": 10, # Example hyperparameter
            "teacher_output_uri": teacher_output_uri # Pass teacher outputs if needed by pipeline
            # Add other necessary pipeline parameters
        }
        # MIZ OKI formatted message for Pub/Sub
        message_data = {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "event_type": "trigger_mlops_pipeline",
            "payload": {"pipeline_name": pipeline_name, "parameters": pipeline_params},
            "metadata": {"trace_id": trace_id, "source_component": self.agent_id, "student_model_name": student_model_name}
        }
        message_bytes = json.dumps(message_data).encode('utf-8')
        mlops_topic_full_path = f"projects/{self.project}/topics/{self.mlops_trigger_topic_name}"

        try:
             # Call Pub/Sub Client Proxy method
             message_id = await self.pubsub_client.publish(mlops_topic_full_path, message_bytes)
             self.logger.info(f"B.O.S.S. (Trace: {trace_id}): Triggered MLOps training via Pub/Sub proxy. Topic: {mlops_topic_full_path}, Message ID: {message_id}")
             return {"status": "training_triggered", "student_model_name": student_model_name, "pipeline_trigger_message_id": message_id}
        except Exception as pub_e:
            logger.error(f"B.O.S.S. (Trace: {trace_id}): Failed to publish MLOps trigger via Pub/Sub proxy: {pub_e}", exc_info=True)
            return {"status": "failed", "error": f"PubSub Proxy Error: {pub_e}"}

    # --- Autonomous Experimentation (Reworked Async) ---
    async def run_experimentation_cycle(self, trace_id: str) -> Dict:
        """ Designs, launches, and potentially triggers analysis of experiments using Vertex AI Workflows client proxy. """
        self.logger.info(f"Starting experimentation cycle (Trace: {trace_id})...")
        # --- TODO: Implement logic to identify opportunities ---
        # Example: Query KG Tool API for underperforming campaigns, user segments, or processes
        # kg_request = {"payload": {"query": "FIND_EXPERIMENT_OPPORTUNITIES_QUERY"}, "trace_id": trace_id}
        # kg_response = await self.tools.get("kg").execute_query(request=kg_request)
        # opportunities = kg_response.get("payload", {}).get("results", [])
        opportunities = [{"goal": "Improve conversion rate for segment X", "metric": "conversion_rate"}] # Placeholder
        # --- End TODO ---

        if opportunities:
             opportunity = opportunities[0] # Select first opportunity for now
             goal_desc = opportunity.get("goal", "Improve performance")
             target_metric = opportunity.get("metric", "unknown_metric")
             # --- TODO: Define control/variations based on opportunity ---
             control_query = "MATCH (u:User {segment:'X'}) RETURN u.mizId" # Example control group
             variations = [{"type":"hp_variant", "param":0.5}, {"type":"hp_variant", "param":0.8}] # Example variations
             # --- End TODO ---

             exp_id = await self.design_experiment(goal_desc, target_metric, control_query, variations, trace_id)
             if exp_id:
                  launch_result = await self.launch_experiment(exp_id)
                  if launch_result.get("status") == "running":
                       self.logger.info(f"Experiment {exp_id} launched (Exec ID: {launch_result.get('execution_id')}). Analysis handled by separate workflow/schedule.")
                       return {"status": "experiment_launched", "experiment_id": exp_id, "execution_id": launch_result.get('execution_id')}
                  else:
                      return {"status": "launch_failed", "experiment_id": exp_id, "error": launch_result.get("error")}
             else:
                 return {"status": "design_failed"}
        else:
            self.logger.info(f"Experimentation cycle (Trace: {trace_id}): No suitable opportunities found.")
            return {"status": "no_experiment_opportunity"}

    async def design_experiment(self, goal_desc: str, target_metric: str, control_query: str, variations: List[Dict], trace_id: str) -> Optional[str]:
        """ Designs experiment structure (Placeholder logic). Needs state persistence. """
        self.logger.info(f"Designing experiment (Trace: {trace_id}): {goal_desc}")
        exp_id = f"exp_{uuid.uuid4().hex[:8]}"
        # --- TODO: Use FM Client API proxy for design refinement if needed ---
        # fm_request = {"payload": {"prompt": f"Refine experiment design for goal: {goal_desc}...", ...}, "trace_id": trace_id}
        # fm_response = await self.tools.get("fm").generate_text(input_data=fm_request)
        # refined_design = json.loads(fm_response.get("payload", {}).get("generated_text", "{}"))
        # --- End TODO ---
        design = {
            "id": exp_id,
            "goal": goal_desc,
            "metric": target_metric,
            "status": "designed",
            "control_query": control_query, # Query to identify control group in KG
            "variants": variations, # Description of variations to test
            "trace_id": trace_id,
            "created_at": datetime.now(datetime.timezone.utc).isoformat()
        }
        self.experiments[exp_id] = design
        # --- TODO: Persist self.experiments state ---
        await self.save_state() # Conceptual call
        # --- End TODO ---
        self.logger.info(f"Experiment {exp_id} designed (Trace: {trace_id}).")
        return exp_id

    async def launch_experiment(self, experiment_id: str) -> Dict:
        """ Triggers the Vertex AI Workflow for experiment execution using the REAL client proxy. """
        # --- TODO: Load self.experiments state ---
        if experiment_id not in self.experiments or self.experiments[experiment_id]["status"] != "designed":
            return {"status": "error", "error": "Experiment not found or not in designed state."}
        if not self.workflow_client:
            return {"status": "error", "error": "Workflow client proxy unavailable."}

        trace_id = self.experiments[experiment_id].get("trace_id", f"exp_launch_{experiment_id}")
        self.logger.info(f"Triggering Vertex AI Workflow to launch experiment {experiment_id} (Trace: {trace_id})...")
        try:
            workflow_id = self.config.vertex_ai.experiment_execution_workflow_id
            if not workflow_id: raise ConfigurationError("Experiment execution workflow ID not configured.")

            # Prepare MIZ OKI structured input for the workflow
            workflow_input_payload = {"experiment_id": experiment_id, "experiment_design": self.experiments[experiment_id]}
            miz_oki_input = {
                "miz_oki_version": self.config.miz_oki_schema_version,
                "request_id": f"req_exec_{experiment_id}",
                "trace_id": trace_id,
                "source_component": self.agent_id,
                "target_component": workflow_id, # Target is the workflow itself
                "payload": workflow_input_payload
            }
            # Use real Vertex client proxy method (assuming start_workflow wraps create_execution)
            execution_name = await self.workflow_client.start_workflow(
                project=self.project,
                location=self.location,
                workflow_id=workflow_id,
                miz_oki_input=miz_oki_input # Pass the structured input
            )
            if execution_name:
                self.experiments[experiment_id]["status"] = "running"
                self.experiments[experiment_id]["start_date"] = datetime.now(datetime.timezone.utc).isoformat()
                self.experiments[experiment_id]["execution_id"] = execution_name
                # --- TODO: Persist self.experiments state ---
                await self.save_state() # Conceptual call
                # --- End TODO ---
                self.logger.info(f"Experiment execution workflow started for {experiment_id}. Vertex Exec Name: {execution_name}")
                return {"status": "running", "execution_id": execution_name}
            else:
                raise RuntimeError("Vertex AI client proxy failed to start experiment execution workflow.")
        except ConfigurationError as conf_e:
             logger.error(f"Configuration error launching experiment {experiment_id}: {conf_e}")
             return {"status": "error", "error": str(conf_e)}
        except Exception as e:
             logger.error(f"Error launching experiment {experiment_id} workflow via proxy: {e}", exc_info=True)
             return {"status": "error", "error": str(e)}

    async def analyze_experiment_results(self, experiment_id: str): # Kept for conceptual completeness
        """ Triggers the Vertex AI Workflow for experiment analysis via REAL client proxy. """
        # --- TODO: Load self.experiments state ---
        if experiment_id not in self.experiments: return None
        if not self.workflow_client: return None

        trace_id = self.experiments[experiment_id].get("trace_id", f"exp_analyze_{experiment_id}")
        self.logger.info(f"Triggering Vertex AI Workflow to analyze experiment {experiment_id} (Trace: {trace_id})...")
        try:
            workflow_id = self.config.vertex_ai.experiment_analysis_workflow_id
            if not workflow_id: raise ConfigurationError("Experiment analysis workflow ID not configured.")

            workflow_input_payload = {"experiment_id": experiment_id, "execution_id": self.experiments[experiment_id].get("execution_id")}
            miz_oki_input = {
                "miz_oki_version": self.config.miz_oki_schema_version,
                "request_id": f"req_analyze_{experiment_id}", "trace_id": trace_id,
                "source_component": self.agent_id, "target_component": workflow_id,
                "payload": workflow_input_payload
            }
            analysis_execution_name = await self.workflow_client.start_workflow(
                project=self.project, location=self.location, workflow_id=workflow_id, miz_oki_input=miz_oki_input
            )
            if analysis_execution_name:
                 self.experiments[experiment_id]["status"] = "analysis_pending"
                 self.experiments[experiment_id]["analysis_execution_id"] = analysis_execution_name
                 # --- TODO: Persist self.experiments state ---
                 await self.save_state() # Conceptual call
                 # --- End TODO ---
                 self.logger.info(f"Experiment analysis workflow started for {experiment_id}. Analysis Exec Name: {analysis_execution_name}")
                 return analysis_execution_name
            else:
                raise RuntimeError("Vertex AI client proxy failed to start experiment analysis workflow.")
        except ConfigurationError as conf_e:
             logger.error(f"Configuration error launching analysis for {experiment_id}: {conf_e}")
             return None
        except Exception as e:
             logger.error(f"Error launching experiment analysis workflow for {experiment_id} via proxy: {e}", exc_info=True)
             return None

# --- Initialization (Conceptual for ADK Agent/Tool) ---
# This would happen in the service's main entry point or factory

# moe_registry_manager: Optional[MixtureOfExpertsRegistryManager] = None
# aka_agent_logic: Optional[AutonomousKnowledgeAgentLogic] = None

# async def initialize_foundational_layer():
#     global moe_registry_manager, aka_agent_logic
#     if not _config_obj:
#         logger.critical("Cannot initialize Foundational Layer: CONFIG_OBJ not loaded.")
#         return
#     if not _real_dependencies:
#          logger.warning("Initializing Foundational Layer with MOCKED dependencies.")

#     try:
#         moe_registry_manager = MixtureOfExpertsRegistryManager(_config_obj)
#         await moe_registry_manager.initialize()

#         aka_agent_logic = AutonomousKnowledgeAgentLogic(
#             agent_id="AKA_001_Reworked",
#             config=_config_obj,
#             kg_tool_proxy=_kg_tool_proxy, # Injected proxy
#             fm_client_proxy=_fm_client_proxy, # Injected proxy
#             kd_tool_proxy=_kd_tool_proxy, # Injected proxy
#             moe_registry=moe_registry_manager,
#             workflow_client_proxy=_workflow_client_proxy, # Injected client/proxy
#             pubsub_client_proxy=_pubsub_client_proxy # Injected client/proxy
#         )
#         # In a real deployment, state would be loaded here or by the ADK framework
#         await aka_agent_logic.setup(initial_state={})

#         logger.info("Foundational Layer (AKA Logic) initialized.")
#     except Exception as e:
#          logger.critical(f"Foundational Layer initialization failed: {e}", exc_info=True)
#          moe_registry_manager = None; aka_agent_logic = None

# async def cleanup_foundational_layer():
#      if moe_registry_manager: await moe_registry_manager.cleanup()
#      # Agent cleanup might be handled by ADK framework

# --- Example Usage (Conceptual - How a workflow step might call this Tool) ---
# async def workflow_step_call_aka(workflow_input: Dict):
#      # Assuming workflow_input contains the MIZ OKI payload for the AKA tool
#      if aka_agent_logic:
#          aka_response = await aka_agent_logic.invoke(input_data=workflow_input)
#          return aka_response # Return MIZ OKI response to workflow
#      else:
#          # Handle case where tool isn't initialized
#          return {"status": "error", "error_details": [{"message": "AKA Tool not available"}]}


print("\n--- MIZ 3.0 Foundational Layer Logic (Cell 4 - Reworked) ---")
print("AKA logic uses real dependencies/proxies and triggers real clients.")
print("Handles MIZ OKI payloads for API interaction.")
print("Requires implementation of KG queries, external fetch tool, MoE API client.")
print("State persistence needs explicit implementation or ADK framework support.")
print("-----------------------------------------------------------------")

SyntaxError: invalid syntax (<ipython-input-5-a6d709cfab81>, line 67)

In [9]:
# Cell 5: Core Processes Layer Implementation (Reworked)
# Status: Tool logic uses real dependencies/proxies via MIZ OKI APIs/Events.
#         Interactions via MIZ OKI payloads. Async implementation.
#         Placeholders remain for Causal/Sim logic, config/rule loading, persistence.

import time
import datetime
import logging
import numpy as np
import pandas as pd
import random
import json
import uuid
from typing import Dict, Any, Optional, List, Union, Callable, Tuple
from collections import deque, defaultdict, Counter # Added Counter
import asyncio

# --- Assume Real Tool/Client Dependencies are Injected/Available ---
# These proxies represent API clients for other deployed MIZ OKI services or GCP clients.
try:
    # Use CONFIG_OBJ loaded in Cell 1
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ: raise NameError("CONFIG_OBJ not found or is None")
    if not isinstance(CONFIG_OBJ, EnhancedConfig): raise NameError("CONFIG_OBJ is not an EnhancedConfig instance")

    # Proxies for other MIZ OKI Tool APIs
    if 'kg_tool_service_instance' not in globals(): raise NameError("kg_tool_service_instance proxy not found") # Cell 3 Proxy
    if 'moe_registry_manager' not in globals(): raise NameError("moe_registry_manager proxy not found") # Cell 4 Instance/Proxy
    if 'foundation_model_client' not in globals(): raise NameError("foundation_model_client proxy not found") # Cell 18 Proxy
    if 'expert_invoker' not in globals(): raise NameError("expert_invoker proxy not found") # Needs definition/mock

    # Real/Mock Clients for GCP Services
    if '_workflow_executions_client' not in globals(): raise NameError("_workflow_executions_client not found") # Cell 16 Client Proxy
    if '_pubsub_client' not in globals(): raise NameError("_pubsub_client not found") # Cell 8 Client Proxy

    _config_obj = CONFIG_OBJ
    _kg_tool_proxy = kg_tool_service_instance
    _moe_registry_proxy = moe_registry_manager
    _expert_invoker_proxy = expert_invoker
    _fm_client_proxy = foundation_model_client
    _workflow_client_proxy = _workflow_executions_client # Use client proxy from Cell 16
    _pubsub_client_proxy = _pubsub_client # Use client proxy from Cell 8
    _real_dependencies = True
    logger.debug("Using real/conceptual dependencies in Cell 5 (Reworked).")

except NameError as e:
    logger.warning(f"Dependency Error in Cell 5 ({e}). Using Mocks/Placeholders.")
    _real_dependencies = False
    # --- Mock/Placeholder Setup ---
    class MockKGTool: async def get_entity(self, request): return {"status": "success", "payload": {"entity_data": {"entity_type": "mock"}}}; async def execute_query(self, request): return {"status": "success", "payload": {"results": []}}; async def save_decision_record(self, request): return {"status": "success"}
    class MockMoERegistryManager: async def find_expert_for_task(self, *args, **kwargs): return "mock_expert_id"; async def get_expert_details(self, *args, **kwargs): return {"endpoint": "http://mock"}
    class MockExpertInvoker: async def invoke(self, request): await asyncio.sleep(0.01); return {"status": "success", "payload": {"prediction": [0.5]}}
    class MockFMClientTool: async def generate_text(self, input_data): return {"status": "success", "payload": {"generated_text": json.dumps({"action_type": "mock_action"})}}
    class MockVertexWorkflowClient: async def start_workflow(self, project, location, workflow_id, miz_oki_input): return f"projects/{project}/locations/{location}/workflows/{workflow_id}/executions/exec_{uuid.uuid4().hex[:8]}" # Return full name
    class MockPubSubClient: async def publish(self, topic, data_bytes): return f"msg_{uuid.uuid4().hex[:8]}"
    # Define minimal config if needed
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ:
        from dataclasses import dataclass, field
        @dataclass class MockGcpConfig: project_id:Optional[str]="mock-proj"; region:str="mock-region"
        @dataclass class MockSysThresholds: decision_confidence_threshold: float = 0.8; optimization_threshold: float = 0.7; goal_generation_threshold: float = 0.6
        @dataclass class MockFmDefaults: llama4_maverick: str = "mock-llama"; llama4_scout: str = "mock-scout"
        @dataclass class MockFmConfig: defaults: MockFmDefaults = field(default_factory=MockFmDefaults)
        @dataclass class MockVertexAIConfig: planning_workflow_id: str = "mock-planning-wf"
        @dataclass class MockBusinessImpact: kpis: Dict = field(default_factory=dict)
        @dataclass class MockConfig: gcp: MockGcpConfig = field(default_factory=MockGcpConfig); system_thresholds: MockSysThresholds = field(default_factory=MockSysThresholds); mlops_trigger_topic:str="mock-topic"; foundation_models: MockFmConfig = field(default_factory=MockFmConfig); vertex_ai: MockVertexAIConfig = field(default_factory=MockVertexAIConfig); business_impact: MockBusinessImpact = field(default_factory=MockBusinessImpact); miz_oki_schema_version: str = "3.0"; def get(self, key, default=None): parts=key.split('.'); val=self; try: [val := getattr(val, p) for p in parts]; return val; except: return default
        _config_obj = MockConfig()

    _kg_tool_proxy = MockKGTool(); _moe_registry_proxy = MockMoERegistryManager(); _expert_invoker_proxy = MockExpertInvoker(); _fm_client_proxy = MockFMClientTool(); _workflow_client_proxy = MockVertexWorkflowClient(); _pubsub_client_proxy = MockPubSubClient()
    # Define dependent tools used here, using mocks if necessary
    class MockOptimizerTool: async def get_current_objective_priorities(self, input_data): return {"status": "success", "payload": {'priorities': {'ROAS': 0.6}}}; objectives = {'ROAS': {'metrics': [{'name': 'roas'}]}}; metric_history = defaultdict(lambda: deque(maxlen=10)); baselines={'roas': 3}; targets={'roas': 8}; forecasting_models={'roas': 'mock_forecaster'}; def _evaluate_objectives(self, state): return {'ROAS': 0.5} # Simplified mock
    _optimizer_tool_proxy = MockOptimizerTool()
    class MockHdeTool: async def make_decision(self, input_data): return {"status": "success", "payload": {"decision_id": "mock_dec", "action_recommended": True, "final_decision": {"action_type": "test_action"}}}; async def get_history(self, request): return {"status": "success", "payload": {"history": []}}; async def update_decision_log(self, request): return {"status": "success"}
    _hde_tool_proxy = MockHdeTool()
    class MockLITool: async def integrate_learning(self, input_data): return {"status": "success", "payload": {"integration_id": "mock_li"}}
    _li_tool_proxy = MockLITool()
    # --- End Mock/Placeholder Setup ---

logger = logging.getLogger('MIZ-OKI.CoreProcesses')

# --- Causal/Simulation Tool Placeholders (Refined Async - Deployed as Services) ---
class CausalReasoningTool:
    """Placeholder for the Causal Reasoning Tool Service."""
    def __init__(self, config: EnhancedConfig, kg_tool_proxy: Any):
        self.config = config; self.kg_tool = kg_tool_proxy
        self.logger = logging.getLogger('MIZ-OKI.CausalTool')
        self.logger.info("Causal Reasoning Tool logic initialized (Placeholder).")

    async def estimate_effect(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Estimates causal effect. Expects/Returns MIZ OKI."""
        start_time = time.monotonic(); errors = []
        payload = input_data.get("payload", {}); effect_query = payload.get("effect_query", {}); context = payload.get("context", {})
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")
        status = "pending"; response_payload = None

        target = effect_query.get('target_variable'); treatment = effect_query.get('treatment_variable')
        if not target or not treatment: errors.append({"code": "MISSING_PARAMS", "message": "target_variable and treatment_variable required in effect_query."})
        if errors: status = "bad_request"
        else:
            self.logger.info(f"[CAUSAL TOOL] Simulating async causal effect estimation via Tool API: {treatment} -> {target}")
            # --- TODO: Implement real causal inference ---
            # 1. Fetch relevant data via KG Tool API proxy based on query and context.
            # 2. Apply causal discovery/estimation algorithm (e.g., DoWhy, EconML).
            # 3. Handle confounding variables.
            await asyncio.sleep(random.uniform(0.1, 0.3)) # Simulate computation
            # --- End TODO ---
            status = "success"
            response_payload = {"effect_size": random.uniform(-0.1, 0.2), "confidence": random.uniform(0.6, 0.95), "query": effect_query, "_info": "Simulated Async Causal Result"}

        response = { # Manual MIZ OKI response construction
            "miz_oki_version": self.config.miz_oki_schema_version, "request_id": request_id, "trace_id": trace_id,
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(), "source_component": "CausalReasoningTool",
            "target_component": input_data.get("source_component"), "status": status, "payload": response_payload,
            "error_details": errors if errors else None, "metadata": {"processing_duration_ms": (time.monotonic() - start_time) * 1000}
        }
        return response

class SimulationTool:
    """Placeholder for the Business Simulation Tool Service."""
    def __init__(self, config: EnhancedConfig, kg_tool_proxy: Any, fm_client_proxy: Optional[Any] = None):
        self.config = config; self.kg_tool = kg_tool_proxy; self.fm_client = fm_client_proxy
        self.logger = logging.getLogger('MIZ-OKI.SimulationTool')
        self.logger.info("Simulation Tool logic initialized (Placeholder).")

    async def run_scenario(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Runs a simulation scenario. Expects/Returns MIZ OKI."""
        start_time = time.monotonic(); errors = []
        payload = input_data.get("payload", {}); scenario_config = payload.get("scenario_config", {}); context = payload.get("context", {})
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")
        status = "pending"; response_payload = None

        scenario_name = scenario_config.get('name', f"sim_{uuid.uuid4().hex[:6]}")
        if not scenario_config: errors.append({"code": "MISSING_CONFIG", "message": "scenario_config is required."})
        if errors: status = "bad_request"
        else:
            self.logger.info(f"[SIM TOOL] Running async simulation scenario via Tool API: {scenario_name}")
            # --- TODO: Implement real simulation logic ---
            # 1. Fetch current state/parameters via KG Tool API proxy based on context.
            # 2. Apply changes defined in scenario_config.
            # 3. Use MoE/Expert Invoker or FM Client API proxies to predict outcomes.
            # 4. Aggregate predicted metrics.
            await asyncio.sleep(random.uniform(0.2, 0.5)) # Simulate computation
            base_roas = context.get("current_metrics", {}).get("roas", 4.0) # Example context usage
            sim_roas = base_roas * scenario_config.get("sim_cost_multiplier", random.uniform(0.9, 1.2)) # Example effect
            # --- End TODO ---
            status = "success"
            response_payload = {"predicted_roas": sim_roas, "scenario_name": scenario_name, "_info": f"Simulated async outcome"}

        response = { # Manual MIZ OKI response construction
            "miz_oki_version": self.config.miz_oki_schema_version, "request_id": request_id, "trace_id": trace_id,
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(), "source_component": "SimulationTool",
            "target_component": input_data.get("source_component"), "status": status, "payload": response_payload,
            "error_details": errors if errors else None, "metadata": {"processing_duration_ms": (time.monotonic() - start_time) * 1000}
        }
        return response

# --- Ethical Guardrails Tool (Remains Relatively Stable - Internal Use, Sync OK) ---
class EthicalGuardrailsTool:
    """Applies ethical checks to decisions. Internal component, sync logic acceptable."""
    def __init__(self, config: EnhancedConfig):
        self.config = config
        self.checks: Dict[str, List[Callable]] = defaultdict(list) # Store check functions
        self.logger = logging.getLogger('MIZ-OKI.EthicalGuardrailsTool')
        self._load_checks() # Load checks synchronously during init
        self.logger.info("Ethical Guardrails Tool logic initialized.")

    def _load_checks(self):
        """Loads ethical check functions (e.g., based on config or dynamically)."""
        # --- TODO: Load or define specific check functions ---
        # Example check function (sync)
        def check_fairness_basic(decision_type: str, context: Dict, decision: Dict) -> Tuple[bool, str]:
            # Placeholder: Check if decision targets sensitive demographics unfairly
            if decision.get("target_segment") in ["segment_A_sensitive"]:
                return False, "Decision targets sensitive segment potentially unfairly."
            return True, "Fairness check passed (basic)."

        self.checks["all"].append(check_fairness_basic) # Apply to all decisions
        self.checks["ad_targeting"].append(check_fairness_basic) # Apply specifically
        # --- End TODO ---
        self.logger.info(f"Loaded {sum(len(v) for v in self.checks.values())} ethical checks.")

    def review_decision(self, decision_type: str, context: Dict, decision: Dict) -> Dict:
        """Reviews a decision against applicable ethical checks synchronously."""
        results = {"approved": True, "checks_passed": [], "checks_failed": [], "reason": "All checks passed."}
        checks_to_run = self.checks.get("all", []) + self.checks.get(decision_type, [])
        if not checks_to_run: return results # No checks defined

        self.logger.debug(f"Running {len(checks_to_run)} ethical checks for decision type '{decision_type}'.")
        for check_func in checks_to_run:
            try:
                passed, reason = check_func(decision_type, context, decision)
                if passed:
                    results["checks_passed"].append(reason)
                else:
                    results["approved"] = False
                    results["checks_failed"].append(reason)
                    results["reason"] = f"Failed check: {reason}"
                    self.logger.warning(f"Ethical check failed for decision type '{decision_type}': {reason}")
                    break # Stop on first failure
            except Exception as e:
                results["approved"] = False
                fail_reason = f"Exception during check '{check_func.__name__}': {e}"
                results["checks_failed"].append(fail_reason)
                results["reason"] = fail_reason
                self.logger.error(f"Exception during ethical check: {e}", exc_info=True)
                break
        return results

# --- Hybrid Decision Engine Tool (Reworked Async) ---
class HybridDecisionEngineTool:
    """ Makes decisions asynchronously using multiple reasoning modules. Deployed as a service. """
    def __init__(self, config: EnhancedConfig, kg_tool_proxy: Any, moe_registry_proxy: Any, expert_invoker_proxy: Any,
                 ethical_guardrails_tool: EthicalGuardrailsTool, fm_client_proxy: Optional[Any] = None):
        if not all([config, kg_tool_proxy, moe_registry_proxy, expert_invoker_proxy, ethical_guardrails_tool]):
            raise InitializationError("HDE Tool requires config, ethical guardrails, and proxies for KG, MoE Registry, Expert Invoker.")
        self.config = config
        self.kg_tool = kg_tool_proxy
        self.moe_registry = moe_registry_proxy
        self.expert_invoker = expert_invoker_proxy
        self.ethical_guardrails = ethical_guardrails_tool
        self.fm_client = fm_client_proxy # Optional FM client proxy
        self.decision_blueprints: Dict[str, Dict] = {}
        # TODO: Replace deque with persistent storage (e.g., Firestore, BQ via KG Tool API) for production
        self.decision_history = deque(maxlen=5000)
        # Instantiate internal tools
        self.causal_tool = CausalReasoningTool(config, kg_tool_proxy) # Internal instance or could be separate proxy
        self.simulation_tool = SimulationTool(config, kg_tool_proxy, fm_client_proxy) # Internal instance or proxy
        self.logger = logging.getLogger('MIZ-OKI.HybridDecisionEngineTool')
        self._load_blueprints() # Load sync ok
        self.logger.info("Hybrid Decision Engine Tool logic initialized (Reworked).")

    def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
        """Helper to construct a standard MIZ OKI response."""
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
            "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "HybridDecisionEngineTool", "target_component": request_data.get("source_component"),
            "status": status, "payload": payload, "error_details": errors, "metadata": {}
        }

    def _load_blueprints(self):
        """ Loads decision blueprints (e.g., from GCS, DB, or embedded). """
        # --- TODO: Load blueprints from a persistent/configurable source ---
        # Example blueprints:
        self.decision_blueprints = {
            "campaign_budget_adjustment": {
                "description": "Adjusts campaign budget based on ROAS prediction and equity score.",
                "reasoning_modules": [
                    {"id": "roas_pred", "type": "model", "task_type": "forecasting", "domain": "roas", "inputs": ["campaign_features"], "output_variable": "predicted_roas"},
                    {"id": "equity_score", "type": "kg_query", "query_template": "GET_EQUITY_QUERY for campaign {campaign_id}", "output_variable": "brand_equity"},
                    {"id": "adjustment_logic", "type": "rule", "rule_func": self._calculate_budget_adjustment_rule, "inputs": ["predicted_roas", "brand_equity", "current_budget"], "output_variable": "budget_adjustment"}
                ],
                "aggregation_logic": "prioritized", # Use output from 'adjustment_logic'
                "output_action_variable": "budget_adjustment"
            },
            "system_optimization": {
                 "description": "Suggests system optimization actions based on current state.",
                 "reasoning_modules": [
                     {"id": "llm_suggest_action", "type": "llama4_reasoning", "model_alias": "llama4_scout", "output_variable": "llm_suggestion", "prompt_template": "Context: {current_metrics}, Failing Objectives: {failing_objectives}. Suggest one optimization action as JSON: {{'action_type': '...', 'params': {{...}}}}"}
                 ],
                 "aggregation_logic": "first_valid", # Take the first valid suggestion
                 "output_action_variable": "llm_suggestion"
            }
            # Add more blueprints...
        }
        self.logger.info(f"Loaded {len(self.decision_blueprints)} decision blueprints (Example).")

    def _calculate_budget_adjustment_rule(self, context: Dict) -> Dict:
        """Example synchronous rule function."""
        pred_roas = context.get("predicted_roas", {}).get("prediction", [3.0])[0]
        equity = context.get("brand_equity", {}).get("results", [{"value": 0.7}])[0].get("value", 0.7)
        current_budget = context.get("current_budget", 100.0)
        roas_target = self.config.business_impact.roas_target

        # Simple logic: increase budget if ROAS > target, decrease if significantly below
        adjustment_factor = 1.0
        if pred_roas > roas_target * 1.1: adjustment_factor = 1.1 # Increase 10%
        elif pred_roas < roas_target * 0.8: adjustment_factor = 0.9 # Decrease 10%

        # Modify based on equity (simple example)
        equity_factor = 1.0 + (equity - 0.7) * 0.2 # +/- up to 20% based on equity deviation from 0.7

        new_budget = max(10.0, current_budget * adjustment_factor * equity_factor) # Min budget 10

        return {"action_type": "set_budget", "params": {"new_budget": round(new_budget, 2)}, "_confidence": 0.85} # Add confidence

    async def _execute_reasoning_module(self, module_config: Dict, context: Dict, all_module_outputs: Dict, trace_id: Optional[str]) -> Dict:
        """ Executes a single reasoning module async, calling deployed Tool APIs proxies. """
        module_type = module_config.get("type")
        module_id = module_config.get("id", f"{module_type}_{uuid.uuid4().hex[:4]}")
        output = {"id": module_id, "type": module_type, "status": "pending"}
        start_time = time.monotonic()
        logger.debug(f"HDE: Executing module: {module_id} ({module_type})")

        # --- Dependency Resolution Helper ---
        def resolve_value(val_str: Any) -> Any:
            if isinstance(val_str, str) and val_str.startswith('{') and val_str.endswith('}'):
                key_path = val_str[1:-1]
                parts = key_path.split('.')
                source_id = parts[0]
                if source_id in all_module_outputs:
                    data = all_module_outputs[source_id]
                    try:
                        for part in parts[1:]:
                            if isinstance(data, dict): data = data.get(part)
                            elif isinstance(data, list) and part.isdigit(): data = data[int(part)]
                            else: data = None; break
                        return data # Return resolved value (could be None)
                    except (KeyError, IndexError, TypeError, AttributeError):
                        return None # Indicate resolution failure
                else: return None # Source module output not found
            return val_str # Return original value if not a placeholder

        # Resolve inputs for the current module
        module_inputs = {k: resolve_value(v) for k, v in module_config.get("inputs", {}).items()}
        module_context = {**context, **module_inputs} # Combine global context and resolved inputs

        try:
            output["status"] = "running"
            if module_type == "model":
                task_type = module_config.get("task_type", "prediction")
                domain = module_config.get("domain")
                expert_id = await self.moe_registry.find_expert_for_task(task_type=task_type, domain=domain, context=module_context) # MoE API Call via proxy
                if not expert_id: raise ValueError(f"No suitable expert found via MoE for task '{task_type}/{domain}'.")
                expert_details = await self.moe_registry.get_expert_details(expert_id) # MoE API Call via proxy
                expert_endpoint = expert_details.get("endpoint") if expert_details else None
                if not expert_endpoint or not self.expert_invoker: raise ValueError(f"Expert endpoint for '{expert_id}' not found or invoker proxy missing.")

                # Prepare input data based on module config (using resolved values)
                model_input_data = {k: module_context.get(k) for k in module_config.get("model_input_keys", [])}
                invoker_request = {"payload": {"endpoint": expert_endpoint, "data": model_input_data}, "trace_id": trace_id}
                invoker_response = await self.expert_invoker.invoke(request=invoker_request) # Expert Invoker API Call via proxy

                if invoker_response.get("status") == "success":
                    output["result"] = invoker_response.get("payload", {}) # Store entire payload from expert
                    output["expert_id"] = expert_id
                else: raise RuntimeError(f"Expert Invoker API call failed: {invoker_response.get('error_details')}")

            elif module_type == "rule":
                 rule_func = module_config.get("rule_func")
                 if callable(rule_func):
                     # Run sync rule in thread pool
                     output["result"] = await asyncio.to_thread(rule_func, module_context)
                 else: raise TypeError("Rule function not callable.")

            elif module_type == "causal":
                 causal_query = {k: module_context.get(k) for k in module_config.get("query_keys", [])}
                 causal_request = {"payload": {"effect_query": causal_query, "context": module_context}, "trace_id": trace_id}
                 causal_response = await self.causal_tool.estimate_effect(input_data=causal_request) # Call Causal Tool API proxy
                 if causal_response.get("status") == "success": output["result"] = causal_response.get("payload", {})
                 else: raise RuntimeError(f"Causal Tool API call failed: {causal_response.get('error_details')}")

            elif module_type == "simulation":
                 sim_config = {k: module_context.get(k) for k in module_config.get("config_keys", [])}
                 sim_request = {"payload": {"scenario_config": sim_config, "context": module_context}, "trace_id": trace_id}
                 sim_response = await self.simulation_tool.run_scenario(input_data=sim_request) # Call Sim Tool API proxy
                 if sim_response.get("status") == "success": output["result"] = sim_response.get("payload", {})
                 else: raise RuntimeError(f"Simulation Tool API call failed: {sim_response.get('error_details')}")

            elif module_type == "llama4_reasoning" or module_type == "fm_generate": # Unified type
                if not self.fm_client: raise RuntimeError("FM Client Tool proxy unavailable.")
                prompt_template = module_config.get("prompt_template")
                model_alias = module_config.get("model_alias", self.config.foundation_models.defaults.llama4_maverick)
                output_var = module_config.get("output_variable", "llm_output") # Variable name to store result in

                # Format prompt safely using combined context and previous outputs
                template_context = {**module_context, **all_module_outputs}
                prompt = prompt_template.format_map(defaultdict(lambda: 'N/A', template_context))

                fm_request = {"payload": {"prompt": prompt, "model_alias": model_alias, "max_tokens": module_config.get("max_tokens", 512)}, "trace_id": trace_id}
                fm_response = await self.fm_client.generate_text(input_data=fm_request) # FM Client API Call via proxy

                if fm_response.get("status") == "success":
                    raw_output = fm_response.get("payload", {}).get("generated_text")
                    # Attempt to parse if JSON output is expected
                    try: output["result"] = json.loads(raw_output); output["parsed_json"] = True
                    except (json.JSONDecodeError, TypeError): output["result"] = {"raw_output": raw_output}; output["parsed_json"] = False
                    output["model_alias"] = model_alias
                else: raise RuntimeError(f"FM Client API call failed: {fm_response.get('error_details')}")

            elif module_type == "kg_query": # Added KG Query module
                 if not self.kg_tool: raise RuntimeError("KG Tool proxy unavailable.")
                 query_template = module_config.get("query_template")
                 output_var = module_config.get("output_variable", "kg_results")
                 query_params = {k: module_context.get(k) for k in module_config.get("param_keys", [])}
                 query = query_template.format_map(defaultdict(lambda: 'N/A', module_context)) # Format query string

                 kg_request = {"payload": {"query": query, "parameters": query_params}, "trace_id": trace_id}
                 kg_response = await self.kg_tool.execute_query(request=kg_request) # KG Tool API Call via proxy
                 if kg_response.get("status") == "success":
                     output["result"] = kg_response.get("payload", {}).get("results", [])
                 else: raise RuntimeError(f"KG Tool API call failed: {kg_response.get('error_details')}")

            else:
                output["error"] = f"Unsupported module type: {module_type}"; output["status"] = "failed"

            if "error" not in output: output["status"] = "success"

        except Exception as mod_e:
            output["error"] = str(mod_e); output["status"] = "failed"
            self.logger.error(f"HDE Module {module_id} error async: {mod_e}", exc_info=False) # Log error but don't crash HDE

        output["duration_ms"] = (time.monotonic() - start_time) * 1000
        return output

    def _aggregate_outputs(self, module_outputs: Dict[str, Dict], logic: str) -> Tuple[Optional[Dict], float, Optional[str], Optional[str]]:
        """ Aggregates outputs from multiple reasoning modules based on specified logic (sync). """
        if not module_outputs: return None, 0.0, None, None

        successful_outputs = {mod_id: output for mod_id, output in module_outputs.items() if output.get("status") == "success" and output.get("result") is not None}
        if not successful_outputs: return None, 0.0, None, None # No successful modules

        if logic == "prioritized":
            # Assumes blueprints define priority or uses order
            # Or looks for a specific output variable name defined in the blueprint
            output_var_name = self.decision_blueprints.get(self._current_decision_type, {}).get("output_action_variable") # Need to track current type
            for mod_id, output in successful_outputs.items():
                 if output_var_name and output_var_name in output.get("result", {}):
                      decision = output["result"][output_var_name]
                      confidence = output.get("result", {}).get("_confidence", output.get("confidence", 0.7)) # Get confidence if provided
                      return decision, confidence, mod_id, output.get("type")
            # Fallback: return first successful result if specific variable not found
            first_id = list(successful_outputs.keys())[0]
            first_output = successful_outputs[first_id]
            return first_output.get("result"), first_output.get("confidence", 0.7), first_id, first_output.get("type")

        elif logic == "average": # Example for numerical outputs
            values = [output["result"] for output in successful_outputs.values() if isinstance(output.get("result"), (int, float))]
            confidences = [output.get("confidence", 0.7) for output in successful_outputs.values() if isinstance(output.get("result"), (int, float))]
            if not values: return None, 0.0, None, None
            avg_value = np.average(values, weights=confidences if len(confidences) == len(values) else None)
            avg_confidence = np.mean(confidences) if confidences else 0.7
            return {"aggregated_value": avg_value}, avg_confidence, "aggregation", "average"

        elif logic == "majority_vote": # Example for classification outputs
            votes = [output["result"].get("class") for output in successful_outputs.values() if isinstance(output.get("result"), dict) and "class" in output["result"]]
            if not votes: return None, 0.0, None, None
            majority_class = Counter(votes).most_common(1)[0][0]
            # Confidence could be proportion of votes
            confidence = votes.count(majority_class) / len(votes)
            return {"class": majority_class}, confidence, "aggregation", "majority_vote"

        elif logic == "first_valid": # Return the first successful module's output
            first_id = list(successful_outputs.keys())[0]
            first_output = successful_outputs[first_id]
            # Extract confidence if available in the result dict itself
            confidence = first_output.get("result", {}).get("_confidence", first_output.get("confidence", 0.7))
            return first_output.get("result"), confidence, first_id, first_output.get("type")

        else: # Default to first valid
            self.logger.warning(f"Unsupported aggregation logic '{logic}', defaulting to 'first_valid'.")
            first_id = list(successful_outputs.keys())[0]; first_output = successful_outputs[first_id]
            confidence = first_output.get("result", {}).get("_confidence", first_output.get("confidence", 0.7))
            return first_output.get("result"), confidence, first_id, first_output.get("type")

    async def make_decision(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Makes a decision asynchronously. Expects MIZ OKI input, returns MIZ OKI response. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); decision_type = payload.get("decision_type"); context = payload.get("context", {})
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        # Prepare MIZ OKI response structure
        response = self._create_miz_oki_response(input_data, "pending")
        decision_id = f"dec_{decision_type or 'unknown'}_{uuid.uuid4().hex[:8]}"
        log_entry = { # Detailed log for history and XAI
            "decision_id": decision_id, "decision_type": decision_type, "trace_id": trace_id,
            "timestamp_start": response["timestamp"], "context_preview": str(context)[:250], # Limit preview size
            "status": "pending", "module_outputs": {}, "final_decision": None,
            "final_confidence": 0.0, "chain_of_thought": [], "ethical_review": None,
            "action_recommended": False
        }
        cot = log_entry["chain_of_thought"]; all_module_outputs = {}
        self._current_decision_type = decision_type # Track for aggregation logic

        try:
            if not decision_type or decision_type not in self.decision_blueprints:
                raise ValueError(f"Decision blueprint '{decision_type}' not found or not specified.")
            blueprint = self.decision_blueprints[decision_type]
            cot.append(f"[{datetime.now(datetime.timezone.utc).isoformat()}] Start HDE decision '{decision_type}'. Blueprint: {blueprint.get('description', 'N/A')}")

            # Execute Modules async
            cot.append(f"[{datetime.now(datetime.timezone.utc).isoformat()}] Executing {len(blueprint.get('reasoning_modules', []))} reasoning modules async...")
            module_tasks = [
                self._execute_reasoning_module(mod_cfg, context, all_module_outputs, trace_id)
                for mod_cfg in blueprint.get("reasoning_modules", [])
            ]
            module_results_list = await asyncio.gather(*module_tasks, return_exceptions=True)

            # Process module results
            errors_found_in_modules = False
            for result in module_results_list:
                 if isinstance(result, Exception):
                      errors_found_in_modules = True; cot.append(f"  - ERROR during module execution: {result}")
                      # Log error but don't necessarily stop the whole decision process
                      logger.error(f"HDE {decision_id}: Exception in reasoning module: {result}", exc_info=True)
                      continue
                 if isinstance(result, dict) and (mod_id := result.get("id")):
                      all_module_outputs[mod_id] = result # Store full module output
                      status_indicator = "OK" if result.get("status") == "success" else f"FAILED ({result.get('error')})"
                      cot.append(f"  - Module {mod_id} ({result.get('type')}) finished. Status: {status_indicator}. Duration: {result.get('duration_ms'):.2f} ms")
                      if result.get("status") != "success": errors_found_in_modules = True
                 else:
                      cot.append(f"  - WARNING: Received invalid result type from a module: {type(result)}")
                      errors_found_in_modules = True
            log_entry["module_outputs"] = all_module_outputs
            cot.append(f"[{datetime.now(datetime.timezone.utc).isoformat()}] Reasoning modules execution attempt complete. Errors found: {errors_found_in_modules}")

            # Aggregation (Sync)
            agg_logic = blueprint.get("aggregation_logic", "prioritized")
            cot.append(f"[{datetime.now(datetime.timezone.utc).isoformat()}] Aggregating module outputs using logic: '{agg_logic}'...")
            agg_decision, agg_conf, src_mod_id, src_mod_type = self._aggregate_outputs(all_module_outputs, agg_logic)

            if agg_decision is None:
                raise ValueError(f"Aggregation failed ('{agg_logic}'). No valid decision could be determined from module outputs.")
            cot.append(f"  - Aggregated decision from module '{src_mod_id}' ({src_mod_type}). Confidence: {agg_conf:.4f}")
            log_entry["aggregated_decision"] = agg_decision; log_entry["aggregated_confidence"] = agg_conf

            # Ethical Guardrails (Sync - as it's internal logic)
            cot.append(f"[{datetime.now(datetime.timezone.utc).isoformat()}] Performing ethical review...")
            ethical_review = self.ethical_guardrails.review_decision(decision_type, context, agg_decision)
            log_entry["ethical_review"] = ethical_review
            cot.append(f"  - Ethical review complete. Approved: {ethical_review.get('approved')}. Reason: {ethical_review.get('reason')}")

            # Final Decision & Actionability (Sync)
            final_decision = agg_decision; final_confidence = agg_conf
            if not ethical_review.get("approved", False):
                # Modify decision or flag it if ethics check fails
                final_decision["ethics_flag"] = {"failed_checks": ethical_review.get("checks_failed"), "reason": ethical_review.get("reason")}
                final_confidence *= 0.8 # Reduce confidence if flagged
                log_entry["status"] = "ethics_review_required"
                cot.append(f"  - Decision flagged due to ethical concerns.")
            else:
                log_entry["status"] = "approved_by_engine"
                cot.append(f"  - Ethical review passed.")

            log_entry["final_decision"] = final_decision; log_entry["final_confidence"] = final_confidence
            cot.append(f"[{datetime.now(datetime.timezone.utc).isoformat()}] Final decision compiled.")

            # Determine if action should be recommended based on confidence threshold
            min_confidence = self.config.system_thresholds.decision_confidence_threshold
            # Check if final_decision represents a concrete action (not 'no_action' or empty)
            is_actionable = isinstance(final_decision, dict) and final_decision.get('action_type') not in ['no_action', None, '']
            should_recommend = is_actionable and final_confidence >= min_confidence and log_entry["status"] != "ethics_review_required"
            log_entry["action_recommended"] = should_recommend
            cot.append(f"  - Action Recommended: {should_recommend} (Confidence: {final_confidence:.4f} vs Threshold: {min_confidence:.4f})")

            response["status"] = "success" # HDE process completed successfully (even if action not recommended or ethics flagged)
            response["payload"] = log_entry # Return the detailed log entry

        except Exception as e:
             logger.error(f"Async decision making FAILED for '{decision_type}' (ID: {decision_id}): {e}", exc_info=True)
             response["status"] = "internal_error"; errors.append({"code": "HDE_ERROR", "message": str(e)})
             log_entry["status"] = "failed"; log_entry["error"] = str(e); cot.append(f"[{datetime.now(datetime.timezone.utc).isoformat()}] PROCESS FAILED: {e}")
             response["payload"] = log_entry # Return log even on failure

        log_entry["total_duration_ms"] = (time.monotonic() - start_time) * 1000
        self.decision_history.append(log_entry) # Add to in-memory history

        # Persist decision log async via KG Tool API proxy (fire-and-forget)
        if self.kg_tool and hasattr(self.kg_tool, 'save_decision_record'):
             # Prepare MIZ OKI request for KG Tool
             kg_request = {"payload": {"record": log_entry}, "trace_id": trace_id, "request_id": f"kg_save_dec_{decision_id}"}
             asyncio.create_task(self.kg_tool.save_decision_record(request=kg_request), name=f"save_xai_{decision_id}")
        else:
             logger.warning(f"KG Tool proxy unavailable. Cannot persist decision log {decision_id} to KG.")

        response["metadata"]["processing_duration_ms"] = log_entry["total_duration_ms"]
        if errors: response["error_details"] = errors
        return response

    # --- Add get_history and update_decision_log methods (conceptual, need persistence) ---
    async def get_history(self, request: Dict) -> Dict: # Expects MIZ OKI
        """ Retrieves recent decision history. Needs persistent store integration. """
        start_time = time.monotonic(); errors = []
        payload = request.get("payload", {}); limit = payload.get("limit", 100)
        status = "pending"; response_payload = None
        try:
            # --- TODO: Implement retrieval from persistent store (KG/DB) ---
            # Example using in-memory deque:
            history_list = list(self.decision_history)[-limit:]
            # --- End TODO ---
            status = "success"; response_payload = {"history": history_list}
        except Exception as e:
            status = "internal_error"; errors.append({"code": "HISTORY_ERROR", "message": str(e)})
            logger.error(f"Error retrieving HDE history: {e}", exc_info=True)

        response = self._create_miz_oki_response(request, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

    async def update_decision_log(self, request: Dict) -> Dict: # Expects MIZ OKI
        """ Updates a decision log, e.g., with human review status. Needs persistent store integration. """
        start_time = time.monotonic(); errors = []
        payload = request.get("payload", {}); decision_id = payload.get("decision_id"); update_data = payload.get("update_data")
        status = "pending"; response_payload = None

        if not decision_id or not update_data: errors.append({"code": "MISSING_DATA", "message": "'decision_id' and 'update_data' required."})
        if errors: status = "bad_request"
        else:
            try:
                # --- TODO: Implement update logic for persistent store (KG/DB) ---
                # Example: Find log in DB/KG and update fields.
                # kg_update_request = {"payload": {"decision_id": decision_id, "updates": update_data}, ...}
                # kg_response = await self.kg_tool.update_decision_log_endpoint(request=kg_update_request) # Needs KG endpoint
                # success = kg_response.get("status") == "success"

                # Placeholder using in-memory deque:
                log_entry = next((item for item in self.decision_history if item.get('decision_id') == decision_id), None)
                success = False
                if log_entry:
                    log_entry.update(update_data)
                    log_entry["last_updated_by_api"] = datetime.now(datetime.timezone.utc).isoformat()
                    success = True
                    logger.info(f"Updated decision log {decision_id} in memory.")
                else:
                    logger.warning(f"Decision log {decision_id} not found in memory for update.")
                # --- End TODO ---

                if success: status = "success"; response_payload = {"updated": True, "decision_id": decision_id}
                else: status = "not_found"; errors.append({"code": "LOG_NOT_FOUND", "message": f"Decision log {decision_id} not found for update."})

            except Exception as e:
                status = "internal_error"; errors.append({"code": "UPDATE_ERROR", "message": str(e)})
                logger.error(f"Error updating decision log {decision_id}: {e}", exc_info=True)

        response = self._create_miz_oki_response(request, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response


# --- Learning Integration Tool (Reworked Async) ---
class LearningIntegrationTool:
    """ Manages async integration of learning insights, triggers MLOps via Pub/Sub client proxy. Deployed as a service. """
    def __init__(self, kg_tool_proxy: Any, moe_registry_proxy: Any, expert_invoker_proxy: Any, pubsub_client_proxy: Any, config: EnhancedConfig):
        # Inject dependencies
        if not all([config, kg_tool_proxy, moe_registry_proxy, expert_invoker_proxy, pubsub_client_proxy]):
            raise InitializationError("LearningIntegrationTool requires config and proxies for KG, MoE, Expert Invoker, and PubSub.")
        self.kg_tool = kg_tool_proxy
        self.moe_registry = moe_registry_proxy
        self.expert_invoker = expert_invoker_proxy
        self.pubsub_client = pubsub_client_proxy
        self.config = config
        # TODO: Replace deque with persistent storage for production
        self.learning_history = deque(maxlen=5000)
        self.integration_rules = defaultdict(dict) # Rules for how to handle different knowledge types
        self.bias_detectors = [] # Placeholder for bias detection functions/modules
        self.mlops_trigger_topic_name = config.mlops_trigger_topic
        self.project = config.gcp.project_id
        self.location = config.gcp.region
        self.logger = logging.getLogger('MIZ-OKI.LearningIntegrationTool')
        self._load_rules() # Load rules synchronously
        self.logger.info("Learning Integration Tool logic initialized (Reworked).")

    def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
        """Helper to construct a standard MIZ OKI response."""
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
            "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "LearningIntegrationTool", "target_component": request_data.get("source_component"),
            "status": status, "payload": payload, "error_details": errors, "metadata": {}
        }

    def _load_rules(self):
        """Loads integration rules (e.g., from config file, DB)."""
        # --- TODO: Load rules from a persistent/configurable source ---
        # Example rules:
        self.integration_rules['feedback_batch'] = [
            {"action": "update_kg", "params": {"label": "Feedback"}},
            {"action": "retrain_expert", "params": {"expert_domain": "sentiment_analysis"}, "condition": "sentiment_negative_high"}
        ]
        self.integration_rules['validation_alert'] = [
            {"action": "update_kg", "params": {"label": "SystemAlert"}},
            {"action": "retrain_expert", "params": {"expert_domain": "drift_detection_model"}, "condition": "drift_detected"}
        ]
        # --- End TODO ---
        self.logger.info(f"Loaded {sum(len(v) for v in self.integration_rules.values())} integration rules.")

    async def _run_bias_checks(self, knowledge_data: Any, source: str) -> Tuple[List[Dict], bool]:
        """Placeholder for running bias detection checks."""
        # --- TODO: Implement bias detection logic ---
        # - Requires access to sensitive attributes and model predictions.
        # - Calculate fairness metrics.
        # --- End TODO ---
        logger.debug(f"LI: Running bias checks for source '{source}' (Placeholder).")
        await asyncio.sleep(random.uniform(0.02, 0.08)) # Simulate check time
        bias_found = random.random() < 0.03 # Simulate 3% chance
        checks_log = [{"check": "demographic_parity", "result": "passed" if not bias_found else "failed"}]
        return checks_log, bias_found

    async def integrate_learning(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Integrates learning insights/feedback asynchronously. Expects/Returns MIZ OKI payload. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); knowledge_type = payload.get("knowledge_type"); knowledge_data = payload.get("knowledge_data"); source = payload.get("source"); importance = payload.get("importance", 0.5)
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        # Prepare MIZ OKI response structure
        response = self._create_miz_oki_response(input_data, "pending")
        integration_id = f"li_{knowledge_type or 'unknown'}_{uuid.uuid4().hex[:8]}"
        log_entry = { # Detailed log
            "integration_id": integration_id, "trace_id": trace_id,
            "timestamp": response["timestamp"], "knowledge_type": knowledge_type,
            "source": source, "importance": importance,
            "input_preview": str(knowledge_data)[:250], # Limit preview size
            "status": "pending", "bias_checks": [], "actions_taken": [], "triggered_messages": []
        }

        try:
            if not knowledge_type or knowledge_data is None:
                raise ValueError("Missing 'knowledge_type' or 'knowledge_data' in payload.")

            # 1. Bias Checks (Optional)
            bias_checks_log, bias_found = await self._run_bias_checks(knowledge_data, source)
            log_entry["bias_checks"] = bias_checks_log
            if bias_found:
                log_entry["bias_mitigation"] = "Flagged" # Or trigger specific mitigation action
                logger.warning(f"LI {integration_id}: Potential bias detected in knowledge from '{source}'.")
                # Decide whether to proceed or halt based on policy

            # 2. Determine Actions based on Rules
            actions_to_take = []
            rules_for_type = self.integration_rules.get(knowledge_type, [])
            if rules_for_type:
                logger.debug(f"LI {integration_id}: Applying {len(rules_for_type)} rules for type '{knowledge_type}'.")
                for rule in rules_for_type:
                    condition = rule.get("condition")
                    # --- TODO: Implement condition evaluation logic ---
                    # Example: Check if 'sentiment_negative_high' flag is set in knowledge_data
                    condition_met = True # Placeholder
                    if condition:
                         # condition_met = evaluate_condition(condition, knowledge_data, log_entry)
                         pass
                    # --- End TODO ---
                    if condition_met:
                        actions_to_take.append(rule) # Add rule (which contains action type and params)
            else:
                logger.info(f"LI {integration_id}: No specific integration rules found for type '{knowledge_type}'. Default actions might apply.")
                # Optionally define default actions, e.g., always log to KG
                actions_to_take.append({"action": "update_kg", "params": {"label": knowledge_type or "GenericKnowledge"}})

            # 3. Execute Actions Asynchronously
            if actions_to_take:
                action_tasks = [self._execute_action(action, integration_id, log_entry) for action in actions_to_take]
                action_results = await asyncio.gather(*action_tasks, return_exceptions=True)
                # Process results, update log_entry
                processed_action_results = []
                for i, res in enumerate(action_results):
                    action_info = actions_to_take[i]
                    if isinstance(res, Exception):
                        action_info["result"] = {"status": "error", "error": str(res)}
                        logger.error(f"LI Action '{action_info.get('action')}' failed: {res}")
                    elif isinstance(res, dict):
                        action_info["result"] = res
                    processed_action_results.append(action_info)
                log_entry["actions_taken"] = processed_action_results
            else:
                logger.info(f"LI {integration_id}: No LI actions determined for '{knowledge_type}'.")

            response["status"] = "success"
            log_entry["status"] = "success"
            response["payload"] = log_entry # Return the detailed log as payload

        except Exception as e:
             logger.error(f"Async LI FAILED for '{knowledge_type}' ({integration_id}): {e}", exc_info=True)
             response["status"] = "internal_error"; errors.append({"code": "LI_ERROR", "message": str(e)})
             log_entry["status"] = "failed"; log_entry["error"] = str(e)
             response["payload"] = log_entry # Return log even on failure

        log_entry["total_duration_ms"] = (time.monotonic() - start_time) * 1000
        self.learning_history.append(log_entry) # Add to in-memory history
        response["metadata"]["processing_duration_ms"] = log_entry["total_duration_ms"]
        if errors: response["error_details"] = errors
        # --- TODO: Persist log_entry async via KG Tool API or logging service ---
        return response

    async def _execute_action(self, action_rule: Dict, integration_id: str, log_entry_ref: Dict) -> Dict:
        """ Executes LI action, potentially triggering MLOps via Pub/Sub client proxy using MIZ OKI format. """
        action_type = action_rule.get("action")
        action_params = action_rule.get("params", {})
        action_log = {"action_type": action_type, "params": action_params, "status": "pending"}
        start_time = time.monotonic()
        self.logger.info(f"LI Action ({integration_id}): Executing '{action_type}'...")

        try:
            if action_type == "update_kg":
                # --- TODO: Implement KG update via KG Tool API Proxy ---
                # kg_request = {"payload": {"entities": [...], "relationships": [...]}, "trace_id": log_entry_ref.get("trace_id")}
                # kg_response = await self.kg_tool.add_entities_bulk_endpoint(request=kg_request)
                # if kg_response.get("status") == "success": action_log["status"] = "success"
                # else: action_log["status"] = "failed"; action_log["error"] = kg_response.get("error_details")
                action_log["status"] = "success"; action_log["details"] = "KG update executed (Simulated)."
                await asyncio.sleep(0.05) # Simulate KG call
                # --- End TODO ---
            elif action_type == "retrain_expert" or action_type == "finetune_llm":
                if not self.pubsub_client: raise RuntimeError("PubSub client proxy unavailable.")
                if not self.project: raise ConfigurationError("GCP Project ID not configured.")

                pipeline_name = action_params.get("pipeline_name", "miz3_expert_training_pipeline_v1deploy_apireg" if action_type=="retrain_expert" else "miz3_llm_finetuning_pipeline")
                # Prepare pipeline parameters, merging rule params with defaults/context
                mlops_pipeline_params = {
                    "project": self.project,
                    "location": self.location,
                    "trigger_source": f"li:{integration_id}",
                    "timestamp_trigger": log_entry_ref["timestamp"],
                    **action_params # Include params from the rule (e.g., model_id, dataset_uri)
                }
                # Ensure required params for the specific pipeline are present
                if "model_display_name_prefix" not in mlops_pipeline_params and "model_id_to_retrain" not in mlops_pipeline_params:
                     mlops_pipeline_params["model_display_name_prefix"] = f"li_retrained_{action_params.get('expert_domain', 'model')}"

                # Prepare MIZ OKI Pub/Sub Message
                message_data = {
                    "miz_oki_version": self.config.miz_oki_schema_version,
                    "event_type": "trigger_mlops_pipeline",
                    "payload": {"pipeline_name": pipeline_name, "parameters": mlops_pipeline_params},
                    "metadata": {"trace_id": log_entry_ref.get("trace_id"), "source_component": "LearningIntegrationTool", "li_integration_id": integration_id}
                }
                message_bytes = json.dumps(message_data).encode('utf-8')
                mlops_topic_full_path = f"projects/{self.project}/topics/{self.mlops_trigger_topic_name}"

                # Call Pub/Sub Client Proxy method
                message_id = await self.pubsub_client.publish(mlops_topic_full_path, message_bytes)

                action_log["status"] = "training_triggered"; action_log["pipeline_name"] = pipeline_name; action_log["message_id"] = message_id
                log_entry_ref["triggered_messages"].append(message_id) # Log triggered message ID
            else:
                action_log["status"] = "skipped"; action_log["reason"] = f"Unsupported LI action type: {action_type}"
                logger.warning(f"LI Action ({integration_id}): Unsupported action type '{action_type}'.")

        except Exception as exec_e:
            logger.error(f"Error executing async LI action '{action_type}': {exec_e}", exc_info=True)
            action_log["status"] = "error"; action_log["error"] = str(exec_e)

        action_log["duration_ms"] = (time.monotonic() - start_time) * 1000
        self.logger.info(f"LI Action ({integration_id}): Finished '{action_type}' with status '{action_log['status']}'.")
        return action_log

# --- Holistic Optimizer Tool (Reworked Async) ---
class HolisticOptimizerTool:
    """ Optimizes overall system performance asynchronously based on objectives. Deployed as a service. """
    def __init__(self, config: EnhancedConfig, kg_tool_proxy: Any, hde_tool_proxy: Any, moe_registry_proxy: Any, expert_invoker_proxy: Any):
        # Inject dependencies
        if not all([config, kg_tool_proxy, hde_tool_proxy, moe_registry_proxy, expert_invoker_proxy]):
            raise InitializationError("HolisticOptimizerTool requires config and proxies for KG, HDE, MoE Registry, and Expert Invoker.")
        self.config = config
        self.kg_tool = kg_tool_proxy
        self.hde_tool = hde_tool_proxy
        self.moe_registry = moe_registry_proxy
        self.expert_invoker = expert_invoker_proxy
        self.objectives: Dict[str, Dict] = {}
        self.targets: Dict[str, float] = {}
        self.baselines: Dict[str, float] = {}
        # TODO: Replace deque with persistent storage (e.g., Timeseries DB, BQ) for production
        self.metric_history = defaultdict(lambda: deque(maxlen=1000))
        self.forecasting_models: Dict[str, str] = {}
        # TODO: Replace deque with persistent storage
        self.optimization_history = deque(maxlen=500)
        self.logger = logging.getLogger('MIZ-OKI.HolisticOptimizerTool')
        self._load_objectives_from_config() # Sync load ok
        self.logger.info("Holistic Optimizer Tool logic initialized (Reworked).")

    def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
        """Helper to construct a standard MIZ OKI response."""
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
            "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "HolisticOptimizerTool", "target_component": request_data.get("source_component"),
            "status": status, "payload": payload, "error_details": errors, "metadata": {}
        }

    def _load_objectives_from_config(self):
        """Loads objectives, targets, baselines, and forecasters from the main config object."""
        try:
            self.objectives = self.config.get("optimizer_objectives", {})
            kpis_config = self.config.get("business_impact.kpis", {})
            self.targets = {kpi: data.get('target') for kpi, data in kpis_config.items() if 'target' in data and isinstance(data.get('target'), (int, float))}
            self.baselines = {kpi: data.get('baseline', 0) for kpi, data in kpis_config.items() if isinstance(data.get('baseline'), (int, float))}
            self.forecasting_models = self.config.get("optimizer_forecasting_models", {})
            self.logger.info(f"PO: Loaded {len(self.objectives)} objectives, {len(self.targets)} targets, {len(self.forecasting_models)} forecasters.")
        except Exception as e:
            self.logger.error(f"PO: Failed to load objectives from config: {e}", exc_info=True)
            # Initialize with empty dicts to prevent errors later
            self.objectives = {}; self.targets = {}; self.baselines = {}; self.forecasting_models = {}

    def update_metric(self, metric_name: str, value: float, timestamp_iso: Optional[str] = None):
        """Updates the history for a given metric. Needs persistent storage."""
        # --- TODO: Implement persistent storage update ---
        # Example: Write to BigQuery or Timeseries DB
        # For now, using in-memory deque placeholder
        try:
            ts = timestamp_iso or datetime.now(datetime.timezone.utc).isoformat()
            metric_value = float(value)
            if not np.isnan(metric_value) and not np.isinf(metric_value):
                self.metric_history[metric_name].append({"timestamp": ts, "value": metric_value})
                # logger.debug(f"PO: Updated metric '{metric_name}' with value {metric_value}. History size: {len(self.metric_history[metric_name])}")
            else:
                 logger.warning(f"PO: Received invalid value (NaN/Inf) for metric '{metric_name}'. Ignoring.")
        except (ValueError, TypeError) as e:
            logger.error(f"PO: Failed to update metric '{metric_name}' with value '{value}': {e}")
        # --- End TODO ---

    async def _predict_metric_value(self, metric_name: str, trace_id: Optional[str], horizon_steps: int = 1) -> Optional[float]:
        """ Predicts metric value using MoE Invoker API proxy. """
        expert_alias = self.forecasting_models.get(metric_name)
        if not expert_alias:
            logger.warning(f"PO: No forecasting model configured for metric '{metric_name}'.")
            return None
        if not self.moe_registry or not self.expert_invoker:
            logger.error("PO: MoE Registry or Expert Invoker proxy unavailable for prediction.")
            return None

        try:
            # Find expert via MoE Registry API proxy
            expert_id = await self.moe_registry.find_expert_for_task(task_type="forecasting", domain=metric_name)
            if not expert_id:
                logger.warning(f"PO: Could not find forecasting expert via MoE for '{metric_name}'.")
                return None
            expert_details = await self.moe_registry.get_expert_details(expert_id)
            expert_endpoint = expert_details.get("endpoint") if expert_details else None
            if not expert_endpoint:
                logger.warning(f"PO: Endpoint not found for forecasting expert '{expert_id}'.")
                return None

            # --- TODO: Fetch sufficient history from persistent store ---
            # Example using in-memory deque:
            recent_history = [h['value'] for h in list(self.metric_history.get(metric_name, []))[-20:]] # Get last 20 points
            # --- End TODO ---
            if len(recent_history) < 5: # Need minimum history for forecast
                logger.warning(f"PO: Insufficient history ({len(recent_history)} points) for forecasting '{metric_name}'.")
                return None

            # Prepare input for the forecasting model
            input_data_payload = {"historical_values": recent_history, "steps_to_predict": horizon_steps}

            # Call Expert Invoker API proxy
            invoker_request = {
                "payload": {"endpoint": expert_endpoint, "data": input_data_payload},
                "trace_id": trace_id, "request_id": f"invoker_predict_{metric_name}_{trace_id or uuid.uuid4().hex[:6]}"
            }
            invoker_response = await self.expert_invoker.invoke(request=invoker_request) # Call API proxy

            if invoker_response.get("status") == "success":
                result_payload = invoker_response.get("payload", {})
                # --- TODO: Adapt parsing based on actual forecaster output format ---
                prediction = result_payload.get("prediction")
                if isinstance(prediction, list) and len(prediction) >= horizon_steps:
                    return float(prediction[horizon_steps - 1])
                # --- End TODO ---
                else: logger.warning(f"PO: Forecast expert '{expert_id}' returned invalid format or insufficient steps: {prediction}")
            else:
                logger.warning(f"PO: Expert Invoker API call failed for forecast expert '{expert_id}': {invoker_response.get('error_details')}")

            return None
        except Exception as e:
            logger.error(f"PO: Async forecast via invoker proxy failed for expert '{expert_alias}': {e}", exc_info=True)
            return None

    def _evaluate_objectives(self, metric_state: Dict[str, float]) -> Dict[str, float]:
        """Evaluates objectives based on current/predicted metrics (sync logic ok)."""
        objective_scores = {}
        if not self.objectives: return {}
        for obj_id, obj_data in self.objectives.items():
            target = obj_data.get('target')
            baseline = obj_data.get('baseline', 0)
            metric_name = obj_data.get('metric') # Assuming simple 1 metric per objective for now
            lower_is_better = obj_data.get('lower_is_better', False)

            if metric_name and metric_name in metric_state and target is not None:
                current_value = metric_state[metric_name]
                try:
                    target_f = float(target); baseline_f = float(baseline)
                    # Normalize score between 0 (at baseline) and 1 (at or beyond target)
                    if target_f == baseline_f: score = 1.0 if (lower_is_better and current_value <= target_f) or (not lower_is_better and current_value >= target_f) else 0.0
                    elif lower_is_better: score = (baseline_f - current_value) / (baseline_f - target_f)
                    else: score = (current_value - baseline_f) / (target_f - baseline_f)
                    objective_scores[obj_id] = max(0.0, min(1.0, score)) # Clamp score between 0 and 1
                except (ValueError, TypeError):
                    logger.warning(f"PO: Invalid target/baseline for objective '{obj_id}'. Skipping score calculation.")
                    objective_scores[obj_id] = 0.0 # Or None?
            else:
                logger.debug(f"PO: Cannot evaluate objective '{obj_id}'. Missing metric '{metric_name}' in state or target not set.")
                objective_scores[obj_id] = 0.0 # Or None?
        return objective_scores

    async def check_and_optimize(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Evaluates objectives async and triggers HDE Tool API proxy if needed. Expects/Returns MIZ OKI payload. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); predictive = payload.get("predictive", False); context = payload.get("context", {}) # Optional context
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        # Prepare MIZ OKI response
        response = self._create_miz_oki_response(input_data, "pending")
        opt_id = f"po_check_{uuid.uuid4().hex[:8]}"
        opt_log = { # Detailed log
            "optimization_id": opt_id, "trace_id": trace_id,
            "timestamp": response["timestamp"], "predictive_check": predictive,
            "status": "started", "state_evaluated": None, "eval_state_type": None,
            "objective_scores": {}, "triggered_hde": False, "hde_decision_log_ref": None
        }

        try:
            # 1. Get current/predicted state
            # --- TODO: Fetch current_state from persistent metric store ---
            current_state = {m: hist[-1]["value"] for m, hist in self.metric_history.items() if hist} # Placeholder using in-memory
            # --- End TODO ---
            state_to_evaluate = current_state; eval_state_type = "current"

            if predictive:
                predicted_state = {}
                predict_tasks = [self._predict_metric_value(metric, trace_id) for metric in self.targets.keys()]
                predictions = await asyncio.gather(*predict_tasks)
                for i, metric in enumerate(self.targets.keys()):
                    if predictions[i] is not None: predicted_state[metric] = predictions[i]
                if predicted_state:
                    state_to_evaluate = predicted_state; eval_state_type = "predicted"
                    logger.info(f"PO {opt_id}: Using predicted state for evaluation: {predicted_state}")
                else: logger.warning(f"PO {opt_id}: Predictive check requested but failed to get predictions. Using current state.")

            opt_log["state_evaluated"] = state_to_evaluate; opt_log["eval_state_type"] = eval_state_type
            if not state_to_evaluate: raise ValueError("Insufficient metric data for evaluation.")

            # 2. Evaluate objectives (Sync logic is fine here)
            objective_scores = self._evaluate_objectives(state_to_evaluate)
            opt_log["objective_scores"] = objective_scores
            logger.info(f"PO {opt_id}: Evaluated objective scores ({eval_state_type} state): {objective_scores}")

            # 3. Check Thresholds and Trigger HDE via API Proxy
            threshold = self.config.system_thresholds.optimization_threshold
            failing_objectives = {obj_id: score for obj_id, score in objective_scores.items() if score < threshold}

            if failing_objectives:
                # Prioritize the objective with the lowest score
                primary_failing_id, lowest_score = sorted(failing_objectives.items(), key=lambda item: item[1])[0]
                trigger_reason = f"Objective '{primary_failing_id}' score ({lowest_score:.3f}) below threshold ({threshold:.3f})."
                self.logger.warning(f"PO {opt_id}: {trigger_reason} Triggering HDE Tool API proxy for 'system_optimization'.")
                opt_log["triggered_hde"] = True; opt_log["trigger_reason"] = trigger_reason

                if not self.hde_tool or not hasattr(self.hde_tool, 'make_decision'):
                    raise RuntimeError("HDE Tool proxy unavailable or method missing.")

                # Prepare MIZ OKI payload for HDE Tool API proxy
                hde_context = {"current_metrics": current_state, "predicted_metrics": predicted_state if predictive else None, "objective_scores": objective_scores, "failing_objectives": failing_objectives, **context} # Pass context from input
                hde_request = {
                    "payload": {"decision_type": "system_optimization", "context": hde_context},
                    "trace_id": trace_id, "request_id": f"hde_opt_{opt_id}"
                }
                # Call HDE Tool API via proxy
                decision_response = await self.hde_tool.make_decision(input_data=hde_request) # Pass MIZ OKI structure

                decision_log = decision_response.get("payload", {}) # HDE returns its log in the payload
                opt_log["hde_decision_log_ref"] = decision_log.get("decision_id")
                opt_log["hde_decision_status"] = decision_log.get("status")

                # Determine PO status based on HDE outcome
                if decision_log.get("status") in ["success", "approved_by_engine"] and decision_log.get("action_recommended"):
                    opt_log["status"] = "optimization_action_recommended"
                elif decision_log.get("status") == "ethics_review_required":
                     opt_log["status"] = "optimization_needs_review"
                else: # HDE failed, or succeeded but recommended no action
                    opt_log["status"] = "optimization_check_complete_no_action"
            else:
                opt_log["status"] = "objectives_met"
                logger.info(f"PO {opt_id}: All objectives met or above threshold.")

            response["status"] = "success" # PO check itself succeeded
            response["payload"] = opt_log # Return PO log

        except Exception as e:
             logger.error(f"PO check_and_optimize FAILED (ID: {opt_id}): {e}", exc_info=True)
             response["status"] = "internal_error"; errors.append({"code": "PO_ERROR", "message": str(e)})
             opt_log["status"] = "failed"; opt_log["error"] = str(e)
             response["payload"] = opt_log # Return log even on failure

        opt_log["total_duration_ms"] = (time.monotonic() - start_time) * 1000
        self.optimization_history.append(opt_log) # Add to in-memory history
        response["metadata"]["processing_duration_ms"] = opt_log["total_duration_ms"]
        if errors: response["error_details"] = errors
        # --- TODO: Persist opt_log async via KG Tool API or logging service ---
        return response

    async def get_current_objective_priorities(self, input_data: Dict = None) -> Dict:
         """ Returns current objective priorities (lower score = higher priority). Expects/Returns MIZ OKI. """
         start_time = time.monotonic(); errors = []
         request_data = input_data or {}
         trace_id = request_data.get("trace_id")
         status = "pending"; response_payload = None

         try:
             # --- TODO: Fetch current_state from persistent metric store ---
             current_state = {m: hist[-1]["value"] for m, hist in self.metric_history.items() if hist} # Placeholder
             # --- End TODO ---
             if not current_state: raise ValueError("Insufficient metric data to calculate priorities.")

             objective_scores = self._evaluate_objectives(current_state)
             # Simple priority: 1.0 - score (lower score means higher priority)
             priorities = {obj_id: max(0.0, min(1.0, 1.0 - score)) for obj_id, score in objective_scores.items()}

             status = "success"; response_payload = {"priorities": priorities, "scores_evaluated": objective_scores}
             logger.debug(f"PO: Calculated objective priorities: {priorities}")

         except Exception as e:
              status = "internal_error"; errors.append({"code": "PRIORITY_ERROR", "message": str(e)})
              logger.error(f"PO: Failed to get objective priorities: {e}", exc_info=True)

         response = self._create_miz_oki_response(request_data, status, response_payload, errors if errors else None)
         response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
         return response

# --- Autonomous Goal Generator Tool (Reworked Async) ---
class AutonomousGoalGeneratorTool:
    """ Generates goals based on optimization status, triggers planning workflow via REAL Vertex client proxy. Deployed as a service. """
    def __init__(self, optimizer_tool_proxy: Any, workflow_client_proxy: Any, config: EnhancedConfig):
        # Inject dependencies
        if not config or not optimizer_tool_proxy or not workflow_client_proxy:
            raise InitializationError("AutonomousGoalGeneratorTool requires config and proxies for Optimizer Tool and Workflow Client.")
        self.optimizer_tool = optimizer_tool_proxy
        self.workflow_client = workflow_client_proxy # Use REAL client proxy
        self.config = config
        self.project = config.gcp.project_id
        self.location = config.gcp.region
        # --- TODO: Replace dict/deque with persistent storage (e.g., Firestore, BQ) ---
        self.goals: Dict[str, Dict] = {}
        self.goal_history = deque(maxlen=1000)
        # --- End TODO ---
        self.logger = logging.getLogger('MIZ-OKI.AutonomousGoalGeneratorTool')
        self.logger.info("Autonomous Goal Generator Tool logic initialized (Reworked).")

    def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
        """Helper to construct a standard MIZ OKI response."""
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
            "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "AutonomousGoalGeneratorTool", "target_component": request_data.get("source_component"),
            "status": status, "payload": payload, "error_details": errors, "metadata": {}
        }

    async def identify_and_generate_goals(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Analyzes objectives via PO Tool API proxy, triggers planning workflows via Vertex client proxy. Expects/Returns MIZ OKI. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        # Prepare MIZ OKI response
        response = self._create_miz_oki_response(input_data, "pending")
        agg_id = f"agg_run_{uuid.uuid4().hex[:8]}"
        run_log = { # Detailed log
            "agg_run_id": agg_id, "trace_id": trace_id,
            "timestamp": response["timestamp"], "status": "started",
            "goals_generated": 0, "triggered_workflows": [], "objective_priorities_evaluated": None
        }

        try:
            if not self.optimizer_tool or not hasattr(self.optimizer_tool, 'get_current_objective_priorities'):
                 raise InitializationError("Optimizer Tool proxy unavailable or method missing.")

            # 1. Call PO Tool API proxy to get objective priorities
            po_request = {"miz_oki_version": "3.0", "trace_id": trace_id, "request_id": f"po_get_prio_for_agg_{agg_id}"}
            po_response = await self.optimizer_tool.get_current_objective_priorities(input_data=po_request) # Call proxy method

            if po_response.get("status") != "success":
                raise RuntimeError(f"Failed to get objective priorities from PO Tool API proxy: {po_response.get('error_details')}")

            objective_priorities = po_response.get("payload", {}).get("priorities", {}) # Lower score = higher priority
            run_log["objective_priorities_evaluated"] = objective_priorities
            logger.info(f"AGG {agg_id}: Evaluated objective priorities: {objective_priorities}")

            # 2. Identify objectives needing goals
            goal_gen_threshold = self.config.system_thresholds.goal_generation_threshold # Lower score = higher priority
            generation_tasks = []
            # --- TODO: Load active goals state from persistent store ---
            active_goal_objectives = {g.get("related_objective_id") for g in self.goals.values() if g.get("status") == "active"} # Using in-memory placeholder
            # --- End TODO ---

            for obj_id, priority_score in objective_priorities.items():
                 # Trigger goal if priority is high (score is low) and no active goal exists for this objective
                 if priority_score > (1.0 - goal_gen_threshold) and obj_id not in active_goal_objectives:
                      logger.info(f"AGG {agg_id}: Objective '{obj_id}' priority ({priority_score:.3f}) exceeds threshold ({1.0 - goal_gen_threshold:.3f}). Queueing goal generation.")
                      # Pass objective ID and score to generation function
                      generation_tasks.append(self._generate_and_trigger_goal(obj_id, priority_score, trace_id))
                 else:
                      logger.debug(f"AGG {agg_id}: Skipping goal generation for objective '{obj_id}' (Priority: {priority_score:.3f}, Active: {obj_id in active_goal_objectives}).")

            # 3. Trigger Goal Generation & Planning Workflows Concurrently
            if generation_tasks:
                 results = await asyncio.gather(*generation_tasks, return_exceptions=True)
                 for i, res in enumerate(results):
                      if isinstance(res, dict) and res.get("status") == "planning_triggered":
                           run_log["goals_generated"] += 1
                           run_log["triggered_workflows"].append(res) # Contains goal_id, execution_id
                      elif isinstance(res, Exception):
                           logger.error(f"AGG {agg_id}: Goal generation/trigger failed: {res}", exc_info=True)
                           errors.append({"code": "GOAL_GEN_ERROR", "message": str(res)})
                      else: # Handle unexpected non-exception, non-dict results
                           logger.error(f"AGG {agg_id}: Received unexpected result from goal generation: {res}")
                           errors.append({"code": "UNEXPECTED_RESULT", "message": "Unexpected result during goal generation."})

            run_log["status"] = "success" if not errors else "partial_failure"
            response["status"] = run_log["status"]
            response["payload"] = run_log

        except Exception as e:
             logger.error(f"AGG identify_and_generate_goals FAILED (ID: {agg_id}): {e}", exc_info=True)
             response["status"] = "internal_error"; errors.append({"code": "AGG_ERROR", "message": str(e)})
             run_log["status"] = "failed"; run_log["error"] = str(e)
             response["payload"] = run_log # Return log even on failure

        run_log["total_duration_ms"] = (time.monotonic() - start_time) * 1000
        response["metadata"]["processing_duration_ms"] = run_log["total_duration_ms"]
        if errors: response["error_details"] = errors
        self.logger.info(f"AGG run {agg_id} finished. Status: {response['status']}. Goals Generated: {run_log['goals_generated']}")
        return response

    async def _generate_and_trigger_goal(self, obj_id: str, priority_score: float, trace_id: Optional[str]) -> Dict:
        """ Generates goal data, triggers planning workflow via REAL Vertex client proxy. """
        goal_id = f"goal_{obj_id}_{uuid.uuid4().hex[:6]}"
        try:
            # --- Goal Data Generation (Sync logic ok) ---
            obj_config = self.config.get(f"optimizer_objectives.{obj_id}", {})
            kpis = [m['name'] for m in obj_config.get("metrics", []) if 'name' in m]
            # Priority mapping (e.g., higher score -> higher priority value 0-1)
            priority_value = max(0.1, min(1.0, priority_score * 1.2)) # Example mapping

            # Fetch targets/baselines from config
            target_values = {kpi: self.config.get(f"business_impact.kpis.{kpi}.target") for kpi in kpis if self.config.get(f"business_impact.kpis.{kpi}.target") is not None}
            baseline_values = {kpi: self.config.get(f"business_impact.kpis.{kpi}.baseline") for kpi in kpis if self.config.get(f"business_impact.kpis.{kpi}.baseline") is not None}

            goal_data = {
                "id": goal_id,
                "description": f"Autonomously generated goal to improve objective '{obj_config.get('name', obj_id)}' (Current Priority Score: {priority_score:.3f})",
                "kpis": kpis,
                "target_values": target_values,
                "baseline_values": baseline_values,
                "related_objective_id": obj_id,
                "priority": priority_value,
                "status": "planning", # Initial status
                "source": "AutonomousGoalGeneratorTool",
                "created_at": datetime.now(datetime.timezone.utc).isoformat(),
                "trace_id": trace_id
            }
            self.goals[goal_id] = goal_data # Add to in-memory store
            # --- TODO: Persist goal_data async (e.g., Firestore, BQ) ---
            # await self._persist_goal(goal_data)
            # --- End TODO ---
            # --- End Goal Data Generation ---

            # Trigger Planning Workflow via Vertex Client Proxy
            if not self.workflow_client: raise RuntimeError("Workflow client proxy unavailable.")
            if not self.project: raise ConfigurationError("GCP Project ID not configured.")

            planning_workflow_id = self.config.vertex_ai.planning_workflow_id
            if not planning_workflow_id: raise ConfigurationError("Planning workflow ID not configured.")

            # Prepare MIZ OKI input for the planning workflow
            workflow_input_payload = {"goal_id": goal_id, "goal_details": goal_data}
            miz_oki_input = {
                "miz_oki_version": self.config.miz_oki_schema_version,
                "request_id": f"req_plan_{goal_id}", "trace_id": trace_id,
                "source_component": "AutonomousGoalGeneratorTool", "target_component": planning_workflow_id,
                "payload": workflow_input_payload
            }
            # Use real Vertex client proxy method
            execution_name = await self.workflow_client.start_workflow(
                project=self.project, location=self.location, workflow_id=planning_workflow_id, miz_oki_input=miz_oki_input
            )

            if execution_name:
                 self.goals[goal_id]["planning_execution_id"] = execution_name
                 self.goals[goal_id]["status"] = "planning" # Update status
                 # --- TODO: Persist goal update async ---
                 # await self._persist_goal(self.goals[goal_id])
                 # --- End TODO ---
                 self.logger.info(f"Triggered planning workflow {planning_workflow_id} (Exec: {execution_name}) for goal {goal_id}.")
                 return {"status": "planning_triggered", "goal_id": goal_id, "execution_id": execution_name}
            else:
                # If start_workflow returns None or empty string on failure
                raise RuntimeError("Vertex client proxy failed to start planning workflow (returned no execution name).")

        except Exception as e:
            logger.error(f"Failed to generate/trigger goal for objective {obj_id}: {e}", exc_info=True)
            self.goals.pop(goal_id, None) # Remove partial goal on failure
            # --- TODO: Remove from persistent store if creation failed mid-way ---
            raise # Re-raise exception to be caught by the caller

    # --- Add get_active_goals and add_goal methods (conceptual, need persistence) ---
    async def get_active_goals(self, request: Dict) -> Dict: # Expects MIZ OKI
        """ Retrieves active goals. Needs persistent store integration. """
        start_time = time.monotonic(); errors = []
        status = "pending"; response_payload = None
        try:
            # --- TODO: Load goals from persistent store ---
            active_goals_list = [g for g in self.goals.values() if g.get("status") == "active"] # Using in-memory placeholder
            # --- End TODO ---
            status = "success"; response_payload = {"active_goals": active_goals_list}
        except Exception as e:
            status = "internal_error"; errors.append({"code": "GOAL_FETCH_ERROR", "message": str(e)})
            logger.error(f"Error fetching active goals: {e}", exc_info=True)

        response = self._create_miz_oki_response(request, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

    async def add_goal(self, request: Dict) -> Dict: # Expects MIZ OKI
        """ Adds a manual goal and triggers planning. Needs persistence. """
        start_time = time.monotonic(); errors = []
        payload = request.get("payload", {})
        # Basic validation
        if not all(k in payload for k in ["description", "kpis"]):
             errors.append({"code": "MISSING_DATA", "message": "'description' and 'kpis' required in payload."})
             status = "bad_request"
        else:
            status = "pending"; response_payload = None
            try:
                # --- TODO: Implement persistence and trigger planning workflow ---
                # 1. Generate goal_id
                # 2. Construct goal_data dict similar to _generate_and_trigger_goal
                # 3. Persist goal_data
                # 4. Trigger planning workflow via self.workflow_client.start_workflow
                # 5. Update persisted goal with execution_id
                # --- Placeholder ---
                goal_id = f"goal_manual_{uuid.uuid4().hex[:6]}"
                logger.info(f"Placeholder: Adding manual goal {goal_id}. Needs persistence and workflow trigger.")
                # Simulate triggering workflow
                await asyncio.sleep(0.1)
                execution_id = f"projects/p/locations/l/workflows/w/executions/exec_manual_{goal_id}"
                # --- End Placeholder ---
                status = "success"; response_payload = {"goal_id": goal_id, "planning_execution_id": execution_id}

            except Exception as e:
                status = "internal_error"; errors.append({"code": "ADD_GOAL_ERROR", "message": str(e)})
                logger.error(f"Error adding manual goal: {e}", exc_info=True)

        response = self._create_miz_oki_response(request, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

# --- Self-Correcting Feedback Tool (Reworked Async) ---
class SelfCorrectingFeedbackTool:
    """ Processes feedback async, triggers LI Tool API proxy. Deployed as a service. """
    def __init__(self, kg_tool_proxy: Any, learning_integrator_tool_proxy: Any, config: EnhancedConfig):
        # Inject dependencies
        if not config or not kg_tool_proxy or not learning_integrator_tool_proxy:
            raise InitializationError("SelfCorrectingFeedbackTool requires config and proxies for KG and LI tools.")
        self.kg_tool = kg_tool_proxy
        self.learning_integrator_tool = learning_integrator_tool_proxy
        self.config = config
        # TODO: Replace deque with persistent storage for production
        self.feedback_history = deque(maxlen=5000)
        self.correction_rules = defaultdict(dict) # Rules based on entity type and feedback content
        self.logger = logging.getLogger('MIZ-OKI.SelfCorrectingFeedbackTool')
        self._load_rules() # Sync load ok
        self.logger.info("Self-Correcting Feedback Tool logic initialized (Reworked).")

    def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
        """Helper to construct a standard MIZ OKI response."""
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
            "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "SelfCorrectingFeedbackTool", "target_component": request_data.get("source_component"),
            "status": status, "payload": payload, "error_details": errors, "metadata": {}
        }

    def _load_rules(self):
        """Loads correction rules (e.g., from config file, DB)."""
        # --- TODO: Load rules from a persistent/configurable source ---
        # Example rules:
        self.correction_rules['ModelPrediction'] = [
            {"condition": "feedback_sentiment == 'negative' and confidence < 0.5", "action": "trigger_learning", "params": {"knowledge_type": "model_correction_low_conf", "importance": 0.6}},
            {"condition": "feedback_contains('incorrect_entity')", "action": "update_kg", "params": {"label": "IncorrectPrediction"}},
            {"condition": "feedback_rating <= 2", "action": "trigger_learning", "params": {"knowledge_type": "model_correction_low_rating", "importance": 0.8}}
        ]
        self.correction_rules['KnowledgeGraphEntity'] = [
             {"condition": "feedback_type == 'data_error'", "action": "update_kg", "params": {"label": "DataQualityIssue"}},
        ]
        # --- End TODO ---
        self.logger.info(f"Loaded {sum(len(v) for v in self.correction_rules.values())} correction rules.")

    async def process_feedback(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Processes feedback async. Expects/Returns MIZ OKI payload. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); entity_id = payload.get("entity_id"); feedback_data = payload.get("feedback_data"); source = payload.get("source")
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        # Prepare MIZ OKI response
        response = self._create_miz_oki_response(input_data, "pending")
        feedback_id = f"fb_{entity_id or 'general'}_{uuid.uuid4().hex[:8]}"
        log_entry = { # Detailed log
            "feedback_id": feedback_id, "trace_id": trace_id,
            "timestamp": response["timestamp"], "entity_id": entity_id, "source": source,
            "input_preview": str(feedback_data)[:250], # Limit preview size
            "status": "pending", "entity_type": None, "corrections_identified": [], "triggered_li_integrations": []
        }

        try:
            # Basic validation
            if not feedback_data: raise ValueError("Missing 'feedback_data' in payload.")
            if not self.learning_integrator_tool: raise InitializationError("Learning Integrator Tool proxy unavailable.")

            entity_type = "unknown"
            # 1. Get Entity Context (Optional, if entity_id provided)
            if entity_id and self.kg_tool:
                try:
                    kg_request = {"payload": {"mizId": entity_id}, "trace_id": trace_id, "request_id": f"kg_get_fb_entity_{feedback_id}"}
                    kg_response = await self.kg_tool.get_entity_endpoint(request=kg_request) # Call KG API proxy
                    if kg_response.get("status") == "success":
                        entity = kg_response.get("payload", {}).get("entity_data", {})
                        entity_type = entity.get("entity_type", "unknown")
                        log_entry["entity_type"] = entity_type
                        logger.debug(f"SCF {feedback_id}: Fetched context for entity {entity_id} (Type: {entity_type}).")
                    else:
                        logger.warning(f"SCF {feedback_id}: Could not fetch entity {entity_id} from KG: {kg_response.get('error_details')}")
                except Exception as kg_e:
                     logger.error(f"SCF {feedback_id}: Error fetching entity context from KG: {kg_e}")
                     # Continue without entity context if KG fails

            # 2. Determine Correction Actions based on Rules
            correction_actions = []
            rules_for_type = self.correction_rules.get(entity_type, []) + self.correction_rules.get("default", []) # Include default rules
            if rules_for_type:
                logger.debug(f"SCF {feedback_id}: Applying {len(rules_for_type)} rules for type '{entity_type}'.")
                for rule in rules_for_type:
                    condition = rule.get("condition")
                    # --- TODO: Implement condition evaluation logic ---
                    # This needs a safe way to evaluate conditions against feedback_data and entity context
                    # Example: evaluate_condition(condition, {"feedback": feedback_data, "entity": entity})
                    condition_met = True # Placeholder
                    # --- End TODO ---
                    if condition_met:
                        correction_actions.append(rule) # Add rule (contains action type and params)
                        log_entry["corrections_identified"].append({"rule_condition": condition, "action": rule.get("action")})
            else:
                logger.info(f"SCF {feedback_id}: No specific correction rules found for type '{entity_type}'. Default actions might apply.")
                # Optionally define default actions, e.g., always trigger learning
                correction_actions.append({"action": "trigger_learning", "params": {"knowledge_type": "general_feedback", "importance": 0.5}})
                log_entry["corrections_identified"].append({"rule_condition": "default", "action": "trigger_learning"})


            # 3. Trigger Learning Integrator Tool API Proxy for determined actions
            if correction_actions:
                li_tasks = []
                for action_rule in correction_actions:
                    if action_rule.get("action") == "trigger_learning":
                        li_payload = {
                            "knowledge_type": action_rule.get("params", {}).get("knowledge_type", "correction"),
                            "knowledge_data": {"feedback": feedback_data, "entity_id": entity_id, "entity_type": entity_type}, # Pass relevant data
                            "source": f"scf:{feedback_id}",
                            "importance": action_rule.get("params", {}).get("importance", 0.8)
                        }
                        li_request = {
                            "payload": li_payload,
                            "trace_id": trace_id, "request_id": f"li_trigger_{feedback_id}_{uuid.uuid4().hex[:4]}"
                        }
                        li_tasks.append(self.learning_integrator_tool.integrate_learning(input_data=li_request)) # Call LI API proxy
                    elif action_rule.get("action") == "update_kg":
                        # --- TODO: Implement KG Update via KG Tool API Proxy ---
                        logger.warning(f"SCF {feedback_id}: KG Update action not yet implemented.")
                        # --- End TODO ---
                    # Add other action types if needed

                if li_tasks:
                      li_results = await asyncio.gather(*li_tasks, return_exceptions=True)
                      # Parse MIZ OKI responses from LI Tool
                      log_entry["triggered_li_integrations"] = []
                      for res in li_results:
                          if isinstance(res, dict) and res.get("status") == "success":
                              log_entry["triggered_li_integrations"].append(res.get("payload", {}))
                          elif isinstance(res, dict): # LI Tool reported an error
                              log_entry["triggered_li_integrations"].append({"error": res.get("error_details")})
                              logger.error(f"SCF {feedback_id}: LI Tool API proxy call failed: {res.get('error_details')}")
                          else: # Exception during call
                              log_entry["triggered_li_integrations"].append({"error": str(res)})
                              logger.error(f"SCF {feedback_id}: Exception calling LI Tool API proxy: {res}")

            response["status"] = "success"
            log_entry["status"] = "processed"
            response["payload"] = log_entry # Return the detailed log

        except Exception as e:
             logger.error(f"Error processing feedback async via SCF Tool for {entity_id}: {e}", exc_info=True)
             response["status"] = "internal_error"; errors.append({"code": "SCF_ERROR", "message": str(e)})
             log_entry["status"] = "failed"; log_entry["error"] = str(e)
             response["payload"] = log_entry # Return log even on failure

        log_entry["total_duration_ms"] = (time.monotonic() - start_time) * 1000
        self.feedback_history.append(log_entry) # Add to in-memory history
        response["metadata"]["processing_duration_ms"] = log_entry["total_duration_ms"]
        if errors: response["error_details"] = errors
        # --- TODO: Persist log_entry async via KG Tool API or logging service ---
        return response

# --- Initialization (Conceptual - Tools instantiated by orchestrator/framework) ---
# _ethical_guardrails_tool: Optional[EthicalGuardrailsTool] = None
# _hde_tool: Optional[HybridDecisionEngineTool] = None
# _optimizer_tool: Optional[HolisticOptimizerTool] = None
# _li_tool: Optional[LearningIntegrationTool] = None # This is the one being called by SCF
# _agg_tool: Optional[AutonomousGoalGeneratorTool] = None
# _scf_tool: Optional[SelfCorrectingFeedbackTool] = None

# async def initialize_core_processes():
#      global _ethical_guardrails_tool, _hde_tool, _optimizer_tool, _li_tool, _agg_tool, _scf_tool
#      if not _config_obj or not _real_dependencies:
#          logger.critical("Cannot initialize Core Processes: Config or dependencies missing.")
#          return
#      try:
#          _ethical_guardrails_tool = EthicalGuardrailsTool(_config_obj) # Sync init ok
#          _hde_tool = HybridDecisionEngineTool(_config_obj, _kg_tool_proxy, _moe_registry_proxy, _expert_invoker_proxy, _ethical_guardrails_tool, _fm_client_proxy)
#          _optimizer_tool = HolisticOptimizerTool(_config_obj, _kg_tool_proxy, _hde_tool, _moe_registry_proxy, _expert_invoker_proxy) # Pass HDE proxy
#          _li_tool = LearningIntegrationTool(_kg_tool_proxy, _moe_registry_proxy, _expert_invoker_proxy, _pubsub_client_proxy, _config_obj)
#          _agg_tool = AutonomousGoalGeneratorTool(_optimizer_tool, _workflow_client_proxy, _config_obj) # Pass Optimizer proxy, REAL workflow client proxy
#          _scf_tool = SelfCorrectingFeedbackTool(_kg_tool_proxy, _li_tool, _config_obj) # Pass KG and LI proxies
#          logger.info("Core Process Tools initialized.")
#      except Exception as e:
#           logger.critical(f"Core Process Tools initialization failed: {e}", exc_info=True)
#           # Set all to None on failure
#           _ethical_guardrails_tool = _hde_tool = _optimizer_tool = _li_tool = _agg_tool = _scf_tool = None

print("\n--- MIZ 3.0 Core Processes Layer Logic (Cell 5 - Reworked) ---")
print("Tool logic uses real dependencies/proxies via MIZ OKI APIs/Events.")
print("Handles MIZ OKI payloads for API interaction. Async implementation.")
print("Requires implementation of Causal/Sim tools, config/rule loading, persistence.")
print("----------------------------------------------------------------------")



SyntaxError: invalid syntax (<ipython-input-9-005ee98b627a>, line 49)

In [10]:
# Cell 6: Technical Flow Components (Reworked)
# Status: RAG, R2, NN Tools refactored as async services handling MIZ OKI payloads.
#         Uses real dependencies/proxies via MIZ OKI APIs.
#         RL/MoE base classes unchanged (logic used externally).

import numpy as np
import os
import json
import logging
import time
import random
from typing import Dict, List, Any, Optional, Union, Tuple, Callable
from collections import deque, defaultdict # Added defaultdict
import asyncio
import re # For R2 template parsing

# --- Assume Real Tool/Client Dependencies are Injected/Available ---
# These proxies represent API clients for other deployed MIZ OKI services.
try:
    # Use CONFIG_OBJ loaded in Cell 1
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ: raise NameError("CONFIG_OBJ not found or is None")
    if not isinstance(CONFIG_OBJ, EnhancedConfig): raise NameError("CONFIG_OBJ is not an EnhancedConfig instance")

    # Proxies for other MIZ OKI Tool APIs
    if 'kg_tool_service_instance' not in globals(): raise NameError("kg_tool_service_instance proxy not found") # Cell 3 Proxy
    if 'foundation_model_client' not in globals(): raise NameError("foundation_model_client proxy not found") # Cell 18 Proxy
    if 'hde_tool' not in globals(): raise NameError("hde_tool proxy not found") # Cell 5 Proxy

    _config_obj = CONFIG_OBJ
    _kg_tool_proxy = kg_tool_service_instance
    _fm_client_proxy = foundation_model_client
    _hde_tool_proxy = hde_tool # Inject HDE tool proxy for R2

    # NN Tool delegates to FM Client
    class NeuralProcessingToolDelegate:
        """Simple delegate class for NN embedding tasks using FM Client."""
        def __init__(self, fm_client_proxy: Any):
            self.fm_client = fm_client_proxy
            self.logger = logging.getLogger('MIZ-OKI.NNToolDelegate')
            if not self.fm_client: self.logger.error("NNToolDelegate initialized without FM Client proxy!")

        async def get_embedding(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
            """Forward embedding request to FM Client Tool."""
            if not self.fm_client: return {"status": "error", "error_details": [{"message": "FM Client unavailable"}]}
            # Assume input_data is already a valid MIZ OKI request for the FM Client
            return await self.fm_client.generate_embedding(input_data=input_data)

        async def batch_embed(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
            """Forward batch embedding request to FM Client Tool."""
            if not self.fm_client: return {"status": "error", "error_details": [{"message": "FM Client unavailable"}]}
            # FM client's generate_embedding handles list input for batching
            return await self.fm_client.generate_embedding(input_data=input_data)

    _nn_tool_proxy = NeuralProcessingToolDelegate(_fm_client_proxy)

    _real_dependencies = True
    logger.debug("Using real/conceptual dependencies in Cell 6 (Reworked).")

except NameError as e:
    logger.warning(f"Dependency Error in Cell 6 ({e}). Using Mocks/Placeholders.")
    _real_dependencies = False
    # --- Mock/Placeholder Setup ---
    class MockKGTool: async def get_entity(self, request): await asyncio.sleep(0.01); return {"status": "success", "payload": {"entity_data": {"name": "Mock Entity"}}}; async def get_neighbors(self, request): await asyncio.sleep(0.01); return {"status": "success", "payload": {"neighbors": []}}; async def search_vector_index(self, request): await asyncio.sleep(0.01); return {"status": "success", "payload": {"results": [("mock_id", 0.9, {})]}}; async def execute_query(self, request): await asyncio.sleep(0.01); return {"status": "success", "payload": {"results": []}}
    class MockFMClientTool: async def generate_text(self, input_data): await asyncio.sleep(0.01); return {"status": "success", "payload": {"generated_text": "Mock LLM"}}; async def generate_embedding(self, input_data): await asyncio.sleep(0.01); return {"status": "success", "payload": {"embedding": [[0.1]*10] if isinstance(input_data['payload']['data'], list) else [0.1]*10}}
    class MockHdeTool: async def make_decision(self, input_data): await asyncio.sleep(0.01); return {"status": "success", "payload": {"decision_id": "mock_dec"}}
    class MockNNTool: async def get_embedding(self, input_data): await asyncio.sleep(0.01); return {"status": "success", "payload": {"embedding": [0.1]*10}}; async def batch_embed(self, input_data): await asyncio.sleep(0.01); return {"status": "success", "payload": {"embeddings": [[0.1]*10]*len(input_data['payload']['data'])}}
    # Define minimal config if needed
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ:
        from dataclasses import dataclass, field
        @dataclass class MockGcpConfig: project_id:Optional[str]="mock-proj"; region:str="mock-region"
        @dataclass class MockKgConfig: vector_index_name: str = "mock_index"
        @dataclass class MockFmDefaults: llama4_embedding_model: str = "mock-emb"; llama4_scout: str = "mock-scout"; llama4_maverick: str = "mock-mav"
        @dataclass class MockFmConfig: defaults: MockFmDefaults = field(default_factory=MockFmDefaults)
        @dataclass class MockConfig: gcp: MockGcpConfig = field(default_factory=MockGcpConfig); kg: MockKgConfig = field(default_factory=MockKgConfig); foundation_models: MockFmConfig = field(default_factory=MockFmConfig); miz_oki_schema_version: str = "3.0"; def get_model_info(self, alias): return {"provider": "mock", "model_id": alias, "pricing": {"prompt": 0.1, "completion": 0.2}}; def get(self, key, default=None): parts=key.split('.'); val=self; try: [val := getattr(val, p) for p in parts]; return val; except: return default
        _config_obj = MockConfig()
    _kg_tool_proxy = MockKGTool(); _fm_client_proxy = MockFMClientTool(); _hde_tool_proxy = MockHdeTool(); _nn_tool_proxy = MockNNTool()
    # --- End Mock/Placeholder Setup ---

logger = logging.getLogger('MIZ-OKI.TechnicalFlows')

# --- Semantic Graph RAG (Reworked Async Tool Logic) ---
class SemanticGraphRAGTool:
    """ Implements Graph-enhanced RAG logic. Deployed as a service callable via MIZ OKI API. """
    def __init__(self, config: EnhancedConfig, kg_tool_proxy: Any, fm_client_proxy: Any, nn_tool_proxy: Any):
        if not all([config, kg_tool_proxy, fm_client_proxy, nn_tool_proxy]):
            raise InitializationError("SemanticGraphRAGTool requires config, KG, FM, and NN tool proxies.")
        self.config = config
        self.kg_tool = kg_tool_proxy
        self.fm_client = fm_client_proxy
        self.nn_tool = nn_tool_proxy # Used for embeddings via FM Client delegate
        self.default_vector_index = config.kg.vector_index_name
        self.default_embedding_alias = config.foundation_models.defaults.llama4_embedding_model
        self.logger = logging.getLogger('MIZ-OKI.SemanticGraphRAGTool')
        self.logger.info("Semantic Graph RAG Tool logic initialized (Reworked).")

    def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
        """Helper to construct a standard MIZ OKI response."""
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
            "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "SemanticGraphRAGTool", "target_component": request_data.get("source_component"),
            "status": status, "payload": payload, "error_details": errors, "metadata": {}
        }

    async def _get_embedding(self, text: str, model_alias: Optional[str] = None, trace_id: Optional[str] = None) -> Optional[List[float]]:
        """ Helper to get embedding via NN/FM Client Tool API proxy. """
        if not self.nn_tool:
            self.logger.error("NN/FM Tool proxy unavailable for embedding generation.")
            return None
        target_model_alias = model_alias or self.default_embedding_alias
        try:
            # Prepare MIZ OKI request for NN/FM Tool
            nn_request = {
                "payload": {"data": text, "model_alias": target_model_alias},
                "trace_id": trace_id, "request_id": f"rag_embed_{uuid.uuid4().hex[:6]}"
            }
            nn_response = await self.nn_tool.get_embedding(input_data=nn_request) # Call API proxy

            if nn_response.get("status") == "success":
                embedding = nn_response.get("payload", {}).get("embedding")
                # Handle both single and potential batch returns defensively
                if isinstance(embedding, list) and len(embedding) > 0 and isinstance(embedding[0], float):
                    return embedding # Single embedding returned as list of floats
                elif isinstance(embedding, list) and len(embedding) > 0 and isinstance(embedding[0], list) and isinstance(embedding[0][0], float):
                    return embedding[0] # First embedding from a batch result
                else:
                    logger.error(f"Unexpected embedding format from NN/FM proxy: {type(embedding)}")
                    return None
            else:
                logger.error(f"NN/FM Tool API proxy failed for embedding: {nn_response.get('error_details')}")
                return None
        except Exception as e:
            logger.error(f"Failed to get embedding via NN/FM client proxy: {e}", exc_info=True)
            return None

    async def retrieve_nodes_semantic(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Retrieve relevant nodes via KG Tool API proxy. Expects/Returns MIZ OKI payload structure. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); query = payload.get("query"); k = payload.get("k", 5); namespace = payload.get("namespace"); filter_dict = payload.get("filter_dict"); vector_index_name = payload.get("vector_index_name")
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        if not query: errors.append({"code": "MISSING_QUERY", "message": "Query parameter is required."})
        if not self.kg_tool: errors.append({"code": "MISSING_DEPENDENCY", "message": "KG Tool proxy unavailable."})
        if errors:
            response = self._create_miz_oki_response(input_data, "bad_request", errors=errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        target_index = vector_index_name or self.default_vector_index
        logger.debug(f"RAG: Retrieving nodes semantically. Query: '{query[:50]}...', Index: {target_index}, K={k}, Filter={filter_dict}")
        ids_scores_meta = []
        status = "pending"
        response_payload = None

        try:
            query_embedding = await self._get_embedding(query, trace_id=trace_id)
            if query_embedding is None: raise ValueError("Failed to generate query embedding via NN/FM proxy.")

            # Call KG Tool API proxy (search_vector_index endpoint)
            kg_request = {
                "payload": {"query_vector": query_embedding, "k": k, "namespace": namespace, "filter_dict": filter_dict, "index_name": target_index},
                "trace_id": trace_id, "request_id": f"kg_vec_search_{request_id}"
            }
            kg_response = await self.kg_tool.search_vector_endpoint(request=kg_request) # Call proxy method (ensure method name matches Cell 3 service)

            if kg_response.get("status") == "success":
                vector_results = kg_response.get("payload", {}).get("results", [])
                # Ensure results are in the expected format (id, score, metadata_dict)
                ids_scores_meta = [(vid, score, meta) for vid, score, meta in vector_results if vid and isinstance(score, (float, int))]
                status = "success"
                response_payload = {"retrieved_nodes": ids_scores_meta[:k]} # Ensure K results
                logger.info(f"RAG: Retrieved {len(ids_scores_meta)} nodes via KG Tool API proxy.")
            else:
                raise RuntimeError(f"KG Tool API search failed: {kg_response.get('error_details')}")

        except (ValueError, RuntimeError) as vr_e:
             status = "failed"; errors.append({"code": "RETRIEVAL_FAILED", "message": str(vr_e)}); logger.warning(f"RAG: Semantic retrieval failed: {vr_e}")
        except Exception as e:
             status = "internal_error"; errors.append({"code": "INTERNAL_ERROR", "message": str(e)}); logger.error(f"RAG: Semantic retrieval failed unexpectedly: {e}", exc_info=True)

        response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

    async def retrieve_and_augment(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Retrieves and augments nodes via KG Tool API proxy. Expects/Returns MIZ OKI. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); query = payload.get("query"); k = payload.get("k", 5); namespace = payload.get("namespace"); filter_dict = payload.get("filter_dict"); include_neighbors = payload.get("include_neighbors", True); neighbor_limit = payload.get("neighbor_limit", 3); vector_index_name = payload.get("vector_index_name")
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        if not query: errors.append({"code": "MISSING_QUERY", "message": "Query parameter is required."})
        if not self.kg_tool: errors.append({"code": "MISSING_DEPENDENCY", "message": "KG Tool proxy unavailable."})
        if errors:
            response = self._create_miz_oki_response(input_data, "bad_request", errors=errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        logger.info(f"RAG: Retrieving and augmenting nodes for query: '{query[:50]}...'")
        final_results = []; status = "pending"; response_payload = None

        try:
            # 1. Retrieve initial nodes via internal call (which uses KG API proxy)
            retrieval_request = {
                "payload": {"query": query, "k": k, "namespace": namespace, "filter_dict": filter_dict, "vector_index_name": vector_index_name},
                "trace_id": trace_id, "request_id": f"rag_retrieve_{request_id}" # Link requests
            }
            retrieval_response = await self.retrieve_nodes_semantic(retrieval_request)

            if retrieval_response.get("status") != "success":
                raise RuntimeError(f"Semantic retrieval step failed: {retrieval_response.get('error_details')}")

            top_nodes_with_meta = retrieval_response.get("payload", {}).get("retrieved_nodes", [])
            if not top_nodes_with_meta:
                status = "success_no_results"; response_payload = {"augmented_context": []}
                response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
                response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
                return response

            # 2. Augment concurrently using KG Tool API proxy's get_entity and get_neighbors
            async def augment_single(node_id: str, score: float, node_meta: Dict) -> Optional[Dict]:
                try:
                     # Prepare MIZ OKI requests for KG Tool API proxy
                     entity_req = {"payload": {"mizId": node_id}, "trace_id": trace_id, "request_id": f"kg_get_entity_{node_id}"}
                     # Construct neighbor request payload based on KG Tool API spec
                     neighbor_payload = {"mizId": node_id, "limit": neighbor_limit}
                     if include_neighbors:
                         neighbor_req = {"payload": neighbor_payload, "trace_id": trace_id, "request_id": f"kg_get_neighbors_{node_id}"}
                         neighbors_resp_task = self.kg_tool.get_neighbors_endpoint(request=neighbor_req) # Call API proxy method
                     else:
                         neighbors_resp_task = asyncio.sleep(0, result={"status":"success", "payload": {"neighbors":[]}}) # No-op if neighbors not needed

                     entity_resp_task = self.kg_tool.get_entity_endpoint(request=entity_req) # Call API proxy method

                     entity_resp, neighbors_resp = await asyncio.gather(entity_resp_task, neighbors_resp_task, return_exceptions=True)

                     node_data = None; neighbors_data = []
                     # Process entity response
                     if isinstance(entity_resp, dict) and entity_resp.get("status") == "success":
                         node_data = entity_resp.get("payload",{}).get("entity_data")
                     elif isinstance(entity_resp, Exception):
                          logger.warning(f"RAG Augment: KG get_entity API call failed for {node_id}: {entity_resp}")
                     else: logger.warning(f"RAG Augment: KG get_entity API proxy returned unexpected status for {node_id}: {entity_resp.get('status')}")

                     # Process neighbors response
                     if isinstance(neighbors_resp, dict) and neighbors_resp.get("status") == "success":
                         neighbors_data = neighbors_resp.get("payload",{}).get("neighbors", [])
                     elif isinstance(neighbors_resp, Exception):
                          logger.warning(f"RAG Augment: KG get_neighbors API call failed for {node_id}: {neighbors_resp}")
                     else: logger.warning(f"RAG Augment: KG get_neighbors API proxy returned unexpected status for {node_id}: {neighbors_resp.get('status')}")

                     if node_data is None:
                         logger.warning(f"RAG Augment: Skipping node {node_id} due to failed entity data retrieval.")
                         return None # Skip if core node data failed to retrieve

                     # Format the result item
                     result_item = {
                         "node_id": node_id,
                         "data": node_data, # Contains full properties from get_entity
                         "score": score, # Similarity score from vector search
                         "vector_metadata": node_meta, # Metadata returned by vector search (if any)
                         "neighbors": []
                     }
                     for neighbor in neighbors_data:
                          props = neighbor.get("neighborProps", {})
                          result_item["neighbors"].append({
                              "id": neighbor.get("neighborId"),
                              "type": props.get("entity_type"), # Assuming entity_type is stored
                              "relationship": neighbor.get("relationshipType"),
                              "rel_props": neighbor.get("relationshipProps", {}),
                              "preview": props.get('name', props.get('title', neighbor.get("neighborId"))) # Simple preview
                          })
                     return result_item
                except Exception as augment_e:
                    logger.error(f"RAG Augment: Error processing node {node_id}: {augment_e}", exc_info=False)
                    return None

            augment_tasks = [augment_single(nid, score, meta) for nid, score, meta in top_nodes_with_meta]
            augmented_results = await asyncio.gather(*augment_tasks)
            final_results = [res for res in augmented_results if res is not None]

            status = "success"
            response_payload = {"augmented_context": final_results}
            logger.info(f"RAG: Successfully augmented {len(final_results)} nodes via KG Tool API proxy.")

        except (ValueError, RuntimeError) as vr_e:
             status = "failed"; errors.append({"code": "AUGMENT_FAILED", "message": str(vr_e)}); logger.warning(f"RAG: Augmentation failed: {vr_e}")
        except Exception as e:
             status = "internal_error"; errors.append({"code": "INTERNAL_ERROR", "message": str(e)}); logger.error(f"RAG: Augmentation failed unexpectedly: {e}", exc_info=True)

        response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

    async def generate_response(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Generates response via FM Client API proxy, grounded in context. Expects/Returns MIZ OKI. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); query = payload.get("query"); retrieved_context = payload.get("retrieved_context", []); model_alias = payload.get("model_alias"); max_tokens = payload.get("max_tokens", 512); temperature = payload.get("temperature", 0.2)
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        if not query: errors.append({"code": "MISSING_QUERY", "message": "Query parameter is required."})
        if not self.fm_client: errors.append({"code": "MISSING_DEPENDENCY", "message": "FM Client Tool proxy unavailable."})
        if errors:
            response = self._create_miz_oki_response(input_data, "bad_request", errors=errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        target_model_alias = model_alias or self.config.foundation_models.defaults.llama4_scout # Use scout for generation if not specified
        status = "pending"; response_payload = None; response_metadata = {}

        # Format context string concisely
        context_str = "No relevant context found."
        if retrieved_context and isinstance(retrieved_context, list):
            context_parts = []
            for item in retrieved_context[:5]: # Limit context length for prompt
                if not isinstance(item, dict): continue
                node_data = item.get('data', {})
                preview = node_data.get('name', node_data.get('title', item.get('node_id', 'Unknown Node')))
                # Simple text representation
                context_parts.append(f"- Node {item.get('node_id')}: {preview} (Score: {item.get('score', 0):.2f})")
                # Optionally add concise neighbor info
                # neighbors_preview = [f"{n.get('relationship')} -> {n.get('preview')}" for n in item.get('neighbors', [])[:2]]
                # if neighbors_preview: context_parts.append(f"  Neighbors: {'; '.join(neighbors_preview)}")
            if context_parts:
                context_str = "Relevant Context:\n" + "\n".join(context_parts)
            if len(retrieved_context) > 5:
                 context_str += f"\n... (and {len(retrieved_context) - 5} more context items)"

        # Construct prompt
        prompt = f"Based on the following context, answer the query.\n\nContext:\n---\n{context_str}\n---\n\nQuery: {query}\n\nAnswer:"
        logger.debug(f"RAG Generate Prompt (first 300 chars): {prompt[:300]}...")

        try:
            # Prepare MIZ OKI request for FM Client Tool API proxy
            fm_request = {
                "payload": {
                    "prompt": prompt,
                    "model_alias": target_model_alias,
                    "max_tokens": max_tokens,
                    "temperature": temperature
                },
                "trace_id": trace_id, "request_id": f"fm_rag_gen_{request_id}"
            }
            fm_response = await self.fm_client.generate_text(input_data=fm_request) # Call API proxy

            if fm_response.get("status") == "success":
                generated_text = fm_response.get("payload", {}).get("generated_text")
                if generated_text:
                    status = "success"; response_payload = {"generated_response": generated_text.strip()}
                    # Include metadata from FM call if available
                    response_metadata = fm_response.get("metadata", {})
                else:
                    status = "failed"; errors.append({"code": "GENERATION_EMPTY", "message": "FM Client API proxy returned empty response."})
            else:
                raise RuntimeError(f"FM Client API proxy failed: {fm_response.get('error_details')}")

        except Exception as e:
             status = "internal_error"; errors.append({"code": "GENERATION_ERROR", "message": str(e)}); logger.error(f"RAG: LLM generation via FM Client API proxy failed: {e}", exc_info=True)

        response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
        # Add processing time, potentially merging with FM metadata if it exists
        proc_duration = (time.monotonic() - start_time) * 1000
        response["metadata"] = response_metadata # Start with FM metadata
        response["metadata"]["rag_tool_processing_duration_ms"] = proc_duration
        return response

# --- Context-Adaptive RL Base Class (Remains Sync Logic - Used by External ADK Agents) ---
class ContextAdaptiveRLBase:
    """ Base class for RL algorithms. Logic used by external ADK agents. """
    def __init__(self, state_dim, action_dim, learning_rate=0.001, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.learning_rate = learning_rate
        self.gamma = gamma # Discount factor
        self.epsilon = epsilon # Exploration rate
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.model = self._build_network() # Requires implementation in subclass
        self.logger = logging.getLogger(self.__class__.__name__)

    def _build_network(self):
        """Builds the neural network model (e.g., using TensorFlow/Keras)."""
        # Placeholder - Subclasses must implement this based on the chosen RL algorithm (DQN, PPO, etc.)
        # Example using TF/Keras (requires tensorflow import):
        # try:
        #     import tensorflow as tf
        #     model = tf.keras.models.Sequential([
        #         tf.keras.layers.Dense(64, activation='relu', input_shape=(self.state_dim,)),
        #         tf.keras.layers.Dense(64, activation='relu'),
        #         tf.keras.layers.Dense(self.action_dim, activation='linear') # Output Q-values or policy probabilities
        #     ])
        #     model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate), loss='mse') # Adjust loss based on algorithm
        #     return model
        # except ImportError:
        #      self.logger.error("TensorFlow not installed. Cannot build RL network.")
        #      return None
        raise NotImplementedError("Subclasses must implement _build_network.")

    def get_action(self, state, use_exploration=True):
        """Selects an action based on the current state (epsilon-greedy)."""
        if self.model is None: raise RuntimeError("RL model not built.")
        if use_exploration and np.random.rand() <= self.epsilon:
            return random.randrange(self.action_dim) # Explore
        else:
            # Ensure state is correctly shaped for the model (e.g., batch dimension)
            try:
                state_input = np.reshape(state, [1, self.state_dim])
                act_values = self.model.predict(state_input, verbose=0) # Exploit - Get Q-values or policy
                return np.argmax(act_values[0]) # Choose best action
            except Exception as e:
                 self.logger.error(f"Error during action prediction: {e}")
                 return random.randrange(self.action_dim) # Fallback to random action on error

    def remember(self, state, action, reward, next_state, done):
        """Stores experience in memory (usually managed by the agent/buffer)."""
        # This base class doesn't manage memory; the agent using it does.
        pass

    def train_batch(self, batch):
        """Trains the model on a batch of experiences."""
        if self.model is None: raise RuntimeError("RL model not built.")
        # Placeholder - Subclasses implement specific training logic (e.g., DQN update rule)
        # Example DQN logic:
        # try:
        #     states = np.array([experience[0] for experience in batch])
        #     actions = np.array([experience[1] for experience in batch])
        #     rewards = np.array([experience[2] for experience in batch])
        #     next_states = np.array([experience[3] for experience in batch])
        #     dones = np.array([experience[4] for experience in batch])
        #
        #     current_q_values = self.model.predict(states, verbose=0)
        #     next_q_values = self.model.predict(next_states, verbose=0)
        #
        #     target_q_values = current_q_values.copy()
        #
        #     for i in range(len(batch)):
        #         if dones[i]:
        #             target_q_values[i, actions[i]] = rewards[i]
        #         else:
        #             target_q_values[i, actions[i]] = rewards[i] + self.gamma * np.amax(next_q_values[i])
        #
        #     self.model.fit(states, target_q_values, epochs=1, verbose=0)
        #
        #     # Decay epsilon
        #     if self.epsilon > self.epsilon_min:
        #         self.epsilon *= self.epsilon_decay
        # except Exception as e:
        #      self.logger.error(f"Error during batch training: {e}")
        raise NotImplementedError("Subclasses must implement train_batch.")

    def load_weights(self, filepath):
        """Loads model weights from a file."""
        if self.model is None: raise RuntimeError("RL model not built.")
        try:
            self.model.load_weights(filepath)
            self.logger.info(f"Loaded model weights from {filepath}")
        except Exception as e:
            self.logger.error(f"Error loading weights from {filepath}: {e}")

    def save_weights(self, filepath):
        """Saves model weights to a file."""
        if self.model is None: raise RuntimeError("RL model not built.")
        try:
            # Ensure directory exists
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            self.model.save_weights(filepath)
            self.logger.info(f"Saved model weights to {filepath}")
        except Exception as e:
            self.logger.error(f"Error saving weights to {filepath}: {e}")

# --- Dynamic Expert Evolution (Remains Sync Logic - Used by MLOps/Monitoring) ---
class DynamicExpertEvolution:
    """ Logic for deciding MoE evolution actions. Triggered externally (e.g., by monitoring service). """
    def __init__(self, config: EnhancedConfig, moe_registry_proxy: MixtureOfExpertsRegistryManager):
        if not config or not moe_registry_proxy:
             raise InitializationError("DynamicExpertEvolution requires config and MoE Registry proxy.")
        self.config = config
        self.moe_registry = moe_registry_proxy # Use the proxy
        self.logger = logging.getLogger('MIZ-OKI.DynamicExpertEvolution')
        # TODO: Load thresholds, strategies from config more robustly
        self.performance_threshold = config.get("system_thresholds.expert_performance_threshold", 0.7) # Example config path
        self.improvement_threshold = config.get("system_thresholds.expert_improvement_threshold", 0.05)
        self.retirement_threshold = config.get("system_thresholds.expert_retirement_threshold", 0.5)
        self.min_evaluations = config.get("system_thresholds.expert_min_evaluations", 10)

    def _get_expert_performance_history(self, expert_id: str) -> List[Dict]:
        """ Placeholder: Fetches performance history for an expert (e.g., from monitoring DB/logs or MoE Registry API). """
        # In reality, query a database or monitoring system, or potentially the MoE Registry API if it stores history.
        self.logger.debug(f"Fetching performance history for expert {expert_id} (Placeholder).")
        # Simulate some history
        # This should ideally fetch actual metrics like accuracy, latency, cost per call etc.
        return [{"accuracy": random.uniform(self.retirement_threshold - 0.1, self.performance_threshold + 0.1), "timestamp": datetime.now().isoformat()} for _ in range(random.randint(5, 15))]

    def decide_evolution_actions(self, expert_id: str) -> List[Dict]:
        """Decides evolution actions based on expert performance history."""
        actions = []
        try:
            performance_history = self._get_expert_performance_history(expert_id)
            if len(performance_history) < self.min_evaluations:
                self.logger.info(f"Expert {expert_id}: Insufficient evaluations ({len(performance_history)}/{self.min_evaluations}) for evolution decision.")
                return actions

            # Analyze trends (e.g., average performance, improvement rate)
            recent_performance = [p.get('accuracy', 0) for p in performance_history[-5:]] # Look at last 5 evaluations
            avg_recent_perf = np.mean(recent_performance) if recent_performance else 0
            # --- TODO: Implement more sophisticated trend analysis (e.g., slope of performance over time) ---

            self.logger.info(f"Expert {expert_id}: Avg recent performance (accuracy) = {avg_recent_perf:.3f}")

            # --- Decision Logic ---
            if avg_recent_perf < self.retirement_threshold:
                actions.append({"action": "retire", "expert_id": expert_id, "reason": f"Performance ({avg_recent_perf:.3f}) below retirement threshold ({self.retirement_threshold:.3f})"})
            elif avg_recent_perf < self.performance_threshold:
                actions.append({"action": "retrain", "expert_id": expert_id, "reason": f"Performance ({avg_recent_perf:.3f}) below target threshold ({self.performance_threshold:.3f})"})
                # Optionally suggest distillation if performance is stagnant but acceptable
                # if trend_is_stagnant: actions.append({"action": "distill_new_student", ...})
            else:
                self.logger.info(f"Expert {expert_id} is performing adequately.")
                # Consider specialization or fine-tuning if performance is very high?

        except Exception as e:
             self.logger.error(f"Error deciding evolution for expert {expert_id}: {e}", exc_info=True)

        return actions

# --- Neural Processing Tool (Reworked Async - Delegates to FM Client API Proxy) ---
class NeuralProcessingTool:
    """ Handles embedding generation via FoundationModelClient API proxy. Deployed as a service. """
    def __init__(self, config: EnhancedConfig, fm_client_proxy: Any):
        if not config or not fm_client_proxy:
            raise InitializationError("NeuralProcessingTool requires config and FM Client proxy.")
        self.config = config
        self.fm_client = fm_client_proxy
        self.default_embedding_alias = config.foundation_models.defaults.llama4_embedding_model
        self.logger = logging.getLogger('MIZ-OKI.NeuralProcessingTool')
        self.logger.info("Neural Processing Tool logic initialized (Reworked - delegates to FM Client).")

    def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
        """Helper to construct a standard MIZ OKI response."""
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
            "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "NeuralProcessingTool", "target_component": request_data.get("source_component"),
            "status": status, "payload": payload, "error_details": errors, "metadata": {}
        }

    async def get_embedding(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Gets embedding via FM Client API proxy. Expects/Returns MIZ OKI. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); data = payload.get("data"); model_alias = payload.get("model_alias")
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        if data is None: errors.append({"code": "MISSING_DATA", "message": "'payload.data' (str or List[str]) is required."})
        if not self.fm_client: errors.append({"code": "MISSING_DEPENDENCY", "message": "FM Client Tool proxy unavailable."})
        if errors:
            response = self._create_miz_oki_response(input_data, "bad_request", errors=errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        target_model_alias = model_alias or self.default_embedding_alias
        status = "pending"; response_payload = None; response_metadata = {}

        try:
            # Prepare MIZ OKI request for FM Client Tool API proxy
            fm_request = {
                "payload": {"data": data, "model_alias": target_model_alias},
                "trace_id": trace_id, "request_id": f"fm_embed_{request_id}"
            }
            fm_response = await self.fm_client.generate_embedding(input_data=fm_request) # Call API proxy

            if fm_response.get("status") == "success":
                embedding_result = fm_response.get("payload", {}).get("embedding")
                if embedding_result is not None:
                    status = "success"; response_payload = {"embedding": embedding_result}
                    # Include metadata from FM call if available
                    response_metadata = fm_response.get("metadata", {})
                else:
                    status = "failed"; errors.append({"code": "EMBEDDING_EMPTY", "message": "FM Client API proxy returned no embedding."})
            else:
                raise RuntimeError(f"FM Client API proxy failed for embedding: {fm_response.get('error_details')}")

        except Exception as e:
             status = "internal_error"; errors.append({"code": "EMBEDDING_ERROR", "message": str(e)}); logger.error(f"Async embedding via FM Client proxy failed: {e}", exc_info=True)

        response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
        # Add processing time, potentially merging with FM metadata if it exists
        proc_duration = (time.monotonic() - start_time) * 1000
        response["metadata"] = response_metadata # Start with FM metadata
        response["metadata"]["nn_tool_processing_duration_ms"] = proc_duration
        return response

    async def batch_embed(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
         """ Batch embedding via FM Client API proxy. Expects/Returns MIZ OKI. """
         # The logic is identical to get_embedding because the FM Client's
         # generate_embedding method is designed to handle both single string and list input.
         payload = input_data.get("payload", {})
         if "data" in payload and not isinstance(payload["data"], list):
              # If single item passed to batch endpoint, wrap it in a list for FM client
              payload["data"] = [payload["data"]]
              input_data["payload"] = payload # Update input_data
              logger.debug("Wrapped single item in list for batch_embed call to FM Client.")

         # Call the same underlying method
         return await self.get_embedding(input_data)

# --- R2 Reasoning Tool (Reworked Async) ---
class R2ReasoningTool:
    """ Implements step-by-step reasoning async, calling deployed Tool APIs via proxies. Deployed as a service. """
    def __init__(self, config: EnhancedConfig, kg_tool_proxy: Any, hde_tool_proxy: Any, fm_client_proxy: Any):
        if not all([config, kg_tool_proxy, hde_tool_proxy, fm_client_proxy]):
            raise InitializationError("R2ReasoningTool requires config and proxies for KG, HDE, and FM tools.")
        self.config = config
        self.kg_tool = kg_tool_proxy
        self.hde_tool = hde_tool_proxy
        self.fm_client = fm_client_proxy
        self.reasoning_templates: Dict[str, Dict] = {}
        # TODO: Replace deque with persistent storage (e.g., Firestore, BQ) for production
        self.reasoning_history = deque(maxlen=1000)
        self.logger = logging.getLogger('MIZ-OKI.R2ReasoningTool')
        self._load_templates() # Load templates synchronously during init
        self.logger.info("R2 Reasoning Tool logic initialized (Reworked).")

    def _load_templates(self):
        """ Loads reasoning templates (e.g., from GCS, DB, or embedded). """
        # --- TODO: Load templates from a persistent/configurable source ---
        # Example template structure with async step calling KG Tool API proxy:
        async def _get_campaign_details_step_api(variables: Dict, kg_tool: Any, **kwargs) -> Dict:
             """Async step function calling KG Tool API proxy."""
             campaign_id = variables.get("campaign_id")
             trace_id = kwargs.get("trace_id")
             if not campaign_id: return {"_info": "Skipped: campaign_id missing", "campaign_details": None}
             try:
                 # Prepare MIZ OKI request for KG Tool
                 kg_request = {"payload": {"mizId": campaign_id}, "trace_id": trace_id, "request_id": f"r2_kg_get_{campaign_id}"}
                 kg_response = await kg_tool.get_entity_endpoint(request=kg_request) # Call API proxy method
                 if kg_response.get("status") == "success":
                     details = kg_response.get("payload", {}).get("entity_data")
                     return {"_info": f"Fetched details for {campaign_id} via KG API", "campaign_details": details or {}}
                 else:
                     logger.warning(f"KG API get_entity failed for {campaign_id}: {kg_response.get('error_details')}")
                     return {"_info": f"Failed fetch for {campaign_id}", "campaign_details": None, "_error": True}
             except Exception as e:
                  logger.error(f"Error in _get_campaign_details_step_api for {campaign_id}: {e}")
                  return {"_info": f"Exception fetch for {campaign_id}", "campaign_details": None, "_error": True}

        self.reasoning_templates = {
            "analyze_campaign_perf": {
                "description": "Analyzes campaign performance using details and LLM reasoning.",
                "variables": ["campaign_id", "budget"], # Input variables expected
                "steps": [
                    {
                        "id": "fetch_details",
                        "logic": _get_campaign_details_step_api, # Use the async function calling the API proxy
                        "description": "Fetch campaign details from Knowledge Graph."
                    },
                    {
                        "id": "compare_spend",
                        # Simple sync logic is fine here, will be run in thread by default if needed
                        "logic": lambda vars, **kwargs: {"spend_vs_budget": vars.get('campaign_details',{}).get('spend',0) - vars.get('budget',0)},
                        "description": "Compare actual spend against budget."
                    },
                    {
                        "id": "llm_reason",
                        # Define structured logic for calling FM Client Tool API proxy
                        "logic": {
                            "type": "fm_generate", # Custom type identifier
                            "model_alias": self.config.foundation_models.defaults.llama4_scout, # Fetch from config
                            "output_variable": "llm_analysis",
                            "prompt_template": "Analyze campaign performance based on the following data:\nCampaign ID: {campaign_id}\nBudget: {budget}\nSpend Difference: {spend_vs_budget}\nDetails:\n{campaign_details}\n\nProvide a concise analysis (2-3 sentences):"
                        },
                        "description": "Use LLM to provide a qualitative analysis."
                    }
                    # Add more steps (e.g., call HDE for recommendation)
                ],
                "conclusion_template": "Analysis for Campaign {campaign_id}: {llm_analysis}" # Template for final output
            }
            # Add more templates...
        }
        self.logger.info(f"Loaded {len(self.reasoning_templates)} R2 reasoning templates (Example).")

    def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
        """Helper to construct a standard MIZ OKI response."""
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
            "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "R2ReasoningTool", "target_component": request_data.get("source_component"),
            "status": status, "payload": payload, "error_details": errors, "metadata": {}
        }

    async def reason(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Perform step-by-step reasoning async. Expects/Returns MIZ OKI. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); template_id = payload.get("template_id"); task_input_data = payload.get("input_data", {})
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        if not template_id or template_id not in self.reasoning_templates:
            errors.append({"code": "TEMPLATE_NOT_FOUND", "message": f"Reasoning template '{template_id}' not found."})
            response = self._create_miz_oki_response(input_data, "bad_request", errors=errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        template = self.reasoning_templates[template_id]
        reasoning_id = f"r2_{template_id}_{uuid.uuid4().hex[:8]}"
        status = "pending"; response_payload = None
        log_entry = {
            "reasoning_id": reasoning_id, "template_id": template_id, "trace_id": trace_id,
            "input_preview": str(task_input_data)[:200], "status": "running",
            "steps_executed": [], "variables_state": {}, "conclusion": None,
            "chain_of_thought": [] # Detailed log of execution
        }
        # Initialize variables from input data based on template definition
        variables = {var: task_input_data.get(var) for var in template.get("variables", []) if var in task_input_data}
        log_entry["variables_state"]["initial"] = variables.copy()
        cot = log_entry["chain_of_thought"]
        cot.append(f"[{datetime.now(datetime.timezone.utc).isoformat()}] START Reasoning: {template_id}. Initial Vars: {variables}")

        try:
            for i, step_config in enumerate(template["steps"]):
                step_start = time.monotonic()
                step_id = step_config.get("id", f"step_{i+1}")
                step_desc = step_config.get("description", "No description")
                step_log = {"step_id": step_id, "description": step_desc, "status": "pending"}
                cot.append(f"[{datetime.now(datetime.timezone.utc).isoformat()}] Executing Step: {step_id} ({step_desc})...")

                try:
                    logic = step_config.get("logic")
                    step_output_vars = {}

                    # --- Execute Logic based on type ---
                    if isinstance(logic, dict) and 'type' in logic:
                        logic_type = logic['type']
                        cot.append(f"  - Logic Type: Structured ({logic_type})")
                        # --- Call Tool APIs via Proxies ---
                        if logic_type == 'fm_generate' and self.fm_client:
                            prompt_template = logic["prompt_template"]
                            model_alias = logic.get("model_alias", self.config.foundation_models.defaults.llama4_scout)
                            output_var = logic["output_variable"]
                            # Safely format prompt using available variables
                            prompt = prompt_template.format_map(defaultdict(lambda: 'N/A', variables))
                            cot.append(f"  - Calling FM Client API (Model: {model_alias})")
                            fm_request = {"payload": {"prompt": prompt, "model_alias": model_alias, "max_tokens": logic.get("max_tokens", 512)}, "trace_id": trace_id, "request_id": f"r2_fm_{step_id}"}
                            fm_response = await self.fm_client.generate_text(input_data=fm_request) # Call API proxy
                            if fm_response.get("status") == "success":
                                step_output_vars[output_var] = fm_response.get("payload", {}).get("generated_text")
                                cot.append(f"  - FM Response stored in '{output_var}'.")
                            else: raise RuntimeError(f"FM Client API call failed: {fm_response.get('error_details')}")

                        elif logic_type == 'kg_query' and self.kg_tool:
                            query_template = logic["query_template"]
                            output_var = logic["output_variable"]
                            query = query_template.format_map(defaultdict(lambda: 'N/A', variables))
                            cot.append(f"  - Calling KG Tool API (Query: {query[:50]}...)")
                            kg_request = {"payload": {"query": query, "parameters": logic.get("parameters", {})}, "trace_id": trace_id, "request_id": f"r2_kg_{step_id}"} # Pass params if needed
                            kg_response = await self.kg_tool.execute_query(request=kg_request) # Call API proxy
                            if kg_response.get("status") == "success":
                                step_output_vars[output_var] = kg_response.get("payload", {}).get("results", [])
                                cot.append(f"  - KG Response stored in '{output_var}'.")
                            else: raise RuntimeError(f"KG Tool API call failed: {kg_response.get('error_details')}")

                        elif logic_type == 'hde_call' and self.hde_tool:
                            decision_type = logic["decision_type"]
                            context_vars = logic.get("context_variables", list(variables.keys()))
                            output_var = logic["output_variable"]
                            hde_context = {k: variables[k] for k in context_vars if k in variables}
                            cot.append(f"  - Calling HDE Tool API (Decision: {decision_type})")
                            hde_request = {"payload": {"decision_type": decision_type, "context": hde_context}, "trace_id": trace_id, "request_id": f"r2_hde_{step_id}"}
                            hde_response = await self.hde_tool.make_decision(input_data=hde_request) # Call API proxy
                            if hde_response.get("status") in ["success", "ethics_review_required", "partial_success"]: # Handle various success states
                                step_output_vars[output_var] = hde_response.get("payload", {}) # Store full HDE log
                                cot.append(f"  - HDE Response stored in '{output_var}'. Status: {hde_response.get('status')}")
                            else: raise RuntimeError(f"HDE Tool API call failed: {hde_response.get('error_details')}")

                        else:
                            raise ValueError(f"Unsupported structured logic type or missing tool proxy: {logic_type}")

                    elif callable(logic): # Execute custom async or sync function
                        cot.append(f"  - Logic Type: Callable Function ({logic.__name__})")
                        # Pass proxies to the function
                        tool_proxies = {"kg_tool": self.kg_tool, "hde_tool": self.hde_tool, "fm_client": self.fm_client, "trace_id": trace_id}
                        if asyncio.iscoroutinefunction(logic):
                            step_result = await logic(variables, **tool_proxies)
                        else:
                            # Run sync function in thread pool
                            step_result = await asyncio.to_thread(logic, variables, **tool_proxies)

                        if step_result and isinstance(step_result, dict):
                            step_output_vars = {k:v for k,v in step_result.items() if not k.startswith('_')}
                            step_log["info"] = step_result.get("_info", "Callable executed.")
                            if step_result.get("_error"): raise RuntimeError(step_log["info"])
                        else:
                            logger.warning(f"R2 Step {step_id}: Callable logic returned unexpected type {type(step_result)}")
                            step_log["info"] = "Callable returned non-dict or None."
                    else:
                        raise TypeError(f"Invalid logic type for step {step_id}: {type(logic)}")

                    # Update variables and log step success
                    variables.update(step_output_vars)
                    step_log["status"] = "success"
                    step_log["output_preview"] = {k: str(v)[:100] + ('...' if len(str(v)) > 100 else '') for k, v in step_output_vars.items()} # Preview output
                    cot.append(f"  - Step Success. Output Keys: {list(step_output_vars.keys())}")

                except Exception as step_e:
                    logger.error(f"R2 Step {step_id} FAILED: {step_e}", exc_info=True)
                    step_log["status"] = "failed"; step_log["error"] = str(step_e)
                    log_entry["steps_executed"].append(step_log)
                    cot.append(f"  - Step FAILED: {step_e}")
                    raise step_e # Propagate error to stop reasoning chain

                step_log["duration_ms"] = (time.monotonic() - step_start) * 1000
                log_entry["steps_executed"].append(step_log)
                log_entry["variables_state"][f"after_{step_id}"] = variables.copy() # Log state after step

            # --- Generate Conclusion ---
            conclusion_template = template.get("conclusion_template")
            conclusion = None
            if conclusion_template:
                try:
                    conclusion = conclusion_template.format_map(defaultdict(lambda: 'N/A', variables))
                    cot.append(f"[{datetime.now(datetime.timezone.utc).isoformat()}] Generated Conclusion: {str(conclusion)[:100]}...")
                except KeyError as fmt_e:
                    conclusion = f"Conclusion generation failed: Missing key {fmt_e}"
                    cot.append(f"  - ERROR generating conclusion: {fmt_e}")
                    errors.append({"code": "CONCLUSION_FORMAT_ERROR", "message": str(fmt_e)})
                    status = "partial_success" # Mark as partial if conclusion fails
            else:
                cot.append(f"[{datetime.now(datetime.timezone.utc).isoformat()}] No conclusion template defined.")
                conclusion = variables # Return final variables if no template

            log_entry["conclusion"] = conclusion
            log_entry["status"] = "success" if status == "pending" else status # Update status if not already set to partial
            status = log_entry["status"]
            response_payload = log_entry # Return the full log entry as payload

        except Exception as e:
             status = "internal_error"; errors.append({"code": "R2_ERROR", "message": str(e)}); log_entry["status"] = "failed"; log_entry["error"] = str(e); cot.append(f"[{datetime.now(datetime.timezone.utc).isoformat()}] PROCESS FAILED: {e}")
             logger.error(f"R2 Reasoning failed for template '{template_id}' (ID: {reasoning_id}): {e}", exc_info=True)
             response_payload = log_entry # Return log even on failure

        log_entry["total_duration_ms"] = (time.monotonic() - start_time) * 1000
        self.reasoning_history.append(log_entry) # Add to in-memory history
        # --- TODO: Persist log_entry async via KG Tool API or other logging service ---
        # if self.kg_tool:
        #     asyncio.create_task(self.kg_tool.save_reasoning_log(log_entry)) # Conceptual method
        # --- End TODO ---

        response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = log_entry["total_duration_ms"]
        return response

# --- Initialization (Conceptual - Tools instantiated by framework/orchestrator) ---
# semantic_rag_tool: Optional[SemanticGraphRAGTool] = None
# r2_reasoning_tool: Optional[R2ReasoningTool] = None
# nn_tool_proxy: Optional[NeuralProcessingToolDelegate] = None # Use the delegate

# async def initialize_technical_flows():
#     global semantic_rag_tool, r2_reasoning_tool, nn_tool_proxy
#     if not _config_obj or not _real_dependencies:
#         logger.critical("Cannot initialize Technical Flows: Config or dependencies missing.")
#         return
#     try:
#         # NN Tool is just a delegate around FM Client proxy
#         nn_tool_proxy = NeuralProcessingToolDelegate(_fm_client_proxy)
#         semantic_rag_tool = SemanticGraphRAGTool(_config_obj, _kg_tool_proxy, _fm_client_proxy, nn_tool_proxy)
#         r2_reasoning_tool = R2ReasoningTool(_config_obj, _kg_tool_proxy, _hde_tool_proxy, _fm_client_proxy)
#         logger.info("Technical Flow Tools initialized.")
#     except Exception as e:
#         logger.critical(f"Technical Flow Tools initialization failed: {e}", exc_info=True)
#         semantic_rag_tool = None; r2_reasoning_tool = None; nn_tool_proxy = None

print("\n--- MIZ 3.0 Technical Flows Layer Logic (Cell 6 - Reworked) ---")
print("RAG, R2, NN Tools refactored as async services handling MIZ OKI payloads.")
print("Uses real dependencies/proxies via MIZ OKI APIs.")
print("RL/MoE Base classes unchanged; logic used by external ADK Agents/MLOps.")
print("----------------------------------------------------------------")

SyntaxError: invalid syntax (<ipython-input-10-1f907cc6ac45>, line 63)

In [11]:
# Cell 7: Business Applications Layer (Reworked)
# Status: AWE Service uses real Vertex AI Client proxies (if available).
#         App Tools (BEAB, HP, RTB, AdOpt) use real dependencies/proxies via MIZ OKI APIs.
#         External API tools remain critical placeholders. Logic within App tools needs full implementation.

import logging
import time
import datetime
import random
import json
import uuid
import asyncio
import re # Added for AWE
import hashlib # Added for Pseudonymizer
from typing import Dict, Any, Optional, List, Union, Callable, Tuple # Added Tuple
import aiohttp # For API proxies

# --- GCP Client Libraries & Dependencies ---
# Needed for AWE Service interacting with Vertex AI Workflows
try:
   from google.cloud import workflows_v1
   from google.cloud.workflows import executions_v1
   from google.cloud.workflows.executions_v1.types import Execution
   from google.protobuf import json_format, field_mask_pb2
   from google.api_core import exceptions as google_api_exceptions
   VERTEX_WORKFLOWS_SDK_AVAILABLE = True
   # Assume clients are initialized elsewhere and passed in
   logger.debug("Successfully imported google-cloud-workflows libraries for Cell 7.")
except ImportError:
   logger.warning("google-cloud-workflows library not found. AWE service implementation is limited.")
   VERTEX_WORKFLOWS_SDK_AVAILABLE = False
   # Dummy classes from Cell 16 reformation (or Cell 9)
   class ExecutionState: ACTIVE="ACTIVE"; SUCCEEDED="SUCCEEDED"; FAILED="FAILED"; CANCELLED="CANCELLED"; SUSPENDED="SUSPENDED"
   class DummyProto: pass
   class workflows_v1: class WorkflowsAsyncClient: pass; class Workflow: pass; class GetWorkflowRequest: pass; class UpdateWorkflowRequest: pass; class CreateWorkflowRequest: pass
   class executions_v1: ExecutionState = ExecutionState; class ExecutionsAsyncClient: pass; class Execution(DummyProto): pass; class CreateExecutionRequest: pass; class GetExecutionRequest: pass; class CancelExecutionRequest: pass; class ListExecutionsRequest: pass; class ListExecutionsResponse: pass; ExecutionView = type('Enum', (), {'BASIC': 1, 'FULL': 2})()
   class json_format: @staticmethod def ParseDict(*args, **kwargs): pass; @staticmethod def MessageToDict(msg, **kwargs): return getattr(msg, '_fields', {})
   class google_api_exceptions: class NotFound(Exception): pass; class InvalidArgument(Exception): pass; class PermissionDenied(Exception): pass; class FailedPrecondition(Exception):pass; class GoogleAPIError(Exception): pass
   class field_mask_pb2: class FieldMask: pass

# --- Assume Real Tool/Client Dependencies are Injected/Available ---
# These proxies represent API clients for other deployed MIZ OKI services or GCP clients.
try:
    # Use CONFIG_OBJ loaded in Cell 1
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ: raise NameError("CONFIG_OBJ not found or is None")
    if not isinstance(CONFIG_OBJ, EnhancedConfig): raise NameError("CONFIG_OBJ is not an EnhancedConfig instance")

    # Proxies for other MIZ OKI Tool APIs
    if 'kg_tool_service_instance' not in globals(): raise NameError("kg_tool_service_instance proxy not found") # Cell 3 Proxy
    if 'moe_registry_manager' not in globals(): raise NameError("moe_registry_manager not found") # Cell 4 Instance/Proxy
    if 'foundation_model_client' not in globals(): raise NameError("foundation_model_client proxy not found") # Cell 18 Proxy
    if 'expert_invoker' not in globals(): raise NameError("expert_invoker proxy not found") # Needs definition/mock
    if 'xai' not in globals(): raise NameError("xai (ExplainableAI instance/proxy) not found") # Cell 11 Proxy

    # Real/Mock Clients for GCP Services
    if '_workflow_executions_client' not in globals(): raise NameError("_workflow_executions_client not found") # Cell 16 Client Proxy
    if '_workflows_client' not in globals(): raise NameError("_workflows_client not found") # Cell 7 needs this

    _config_obj = CONFIG_OBJ
    _kg_tool_proxy = kg_tool_service_instance
    _moe_registry_proxy = moe_registry_manager
    _expert_invoker_proxy = expert_invoker
    _fm_client_proxy = foundation_model_client
    _xai_proxy = xai
    _workflow_executions_client_proxy = _workflow_executions_client # Use client proxy from Cell 16
    _workflows_client_proxy = _workflows_client # Use client proxy

    # NN Tool delegates to FM Client (from Cell 6)
    class NeuralProcessingToolDelegate:
        def __init__(self, fm_client_proxy: Any): self.fm_client = fm_client_proxy; self.logger = logging.getLogger('MIZ-OKI.NNToolDelegate');
        async def get_embedding(self, input_data: Dict): return await self.fm_client.generate_embedding(input_data=input_data)
    _nn_tool_proxy = NeuralProcessingToolDelegate(_fm_client_proxy)

    _real_dependencies = True
    logger.debug("Using real/conceptual dependencies in Cell 7 (Reworked).")

except NameError as e:
    logger.warning(f"Dependency Error in Cell 7 ({e}). Using Mocks/Placeholders.")
    _real_dependencies = False
    # --- Mock/Placeholder Setup ---
    class MockKGTool: async def execute_query(self, request): await asyncio.sleep(0.01); return {"status": "success", "payload": {"results": [{'value': 0.7}]}}; async def get_entity(self, request): await asyncio.sleep(0.01); return {"status": "success", "payload": {"entity_data": {}}}
    class MockMoERegistryManager: async def find_expert_for_task(self, *args, **kwargs): await asyncio.sleep(0.01); return "mock_id"; async def get_expert_details(self, *args,**kwargs): await asyncio.sleep(0.01); return {"endpoint":"http://mock"}
    class MockFMClientTool: async def generate_text(self, input_data): await asyncio.sleep(0.01); return {"status": "success", "payload": {"generated_text": "```yaml\nsteps:\n- mock_step: ...\n```"}}; async def generate_embedding(self, input_data): await asyncio.sleep(0.01); return {"status": "success", "payload": {"embedding": [0.1]}}
    class MockExpertInvoker: async def invoke(self, request): await asyncio.sleep(0.01); return {"status": "success", "payload": {"prediction": [5.0], "recommendations": ["ITEM_1"]}}
    class MockNNTool: async def get_embedding(self, input_data): await asyncio.sleep(0.01); return {"status": "success", "payload": {"embedding": [0.1]}}
    class MockXAI: async def record_decision(self, request): await asyncio.sleep(0.01); pass
    class MockVertexClient: # Simplified mock for both workflow/execution
        async def create_execution(self, request): await asyncio.sleep(0.01); return MagicMock(name=f"{request.parent}/executions/{uuid.uuid4().hex[:8]}")
        async def get_execution(self, request): await asyncio.sleep(0.01); return MagicMock(state=ExecutionState.SUCCEEDED, result='{"mock": "result"}', _pb={})
        async def create_workflow(self, request): await asyncio.sleep(0.01); return MagicMock(result=lambda timeout: True) # Simulate LRO
        async def update_workflow(self, request): await asyncio.sleep(0.01); return MagicMock(result=lambda timeout: True) # Simulate LRO
        async def get_workflow(self, request): await asyncio.sleep(0.01); return MagicMock(source_contents='current_source_code')
    # Define minimal config if needed
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ:
        from dataclasses import dataclass, field
        @dataclass class MockGcpConfig: project_id:Optional[str]="mock-proj"; region:str="mock-region"; miz_salt:str="mock_salt"
        @dataclass class MockKgConfig: vector_db_type:str="none"
        @dataclass class MockFmDefaults: llama4_scout: str = "mock-scout"; llama4_maverick: str = "mock-mav"
        @dataclass class MockFmConfig: defaults: MockFmDefaults = field(default_factory=MockFmDefaults)
        @dataclass class MockBusinessImpact: roas_target: float = 8.0
        @dataclass class ServiceEndpointsConfig: ads_platform_api_endpoint: Optional[str]=None; crm_api_endpoint: Optional[str]=None; moe_registry_api_endpoint: Optional[str]=None; expert_invoker_api_endpoint: Optional[str]=None; kg_tool_api_endpoint: Optional[str]=None
        @dataclass class MockConfig: gcp: MockGcpConfig=field(default_factory=MockGcpConfig); kg:MockKgConfig=field(default_factory=MockKgConfig); foundation_models: MockFmConfig = field(default_factory=MockFmConfig); business_impact: MockBusinessImpact = field(default_factory=MockBusinessImpact); miz_salt: str = "mock_salt"; miz_oki_schema_version: str = "3.0"; service_endpoints: ServiceEndpointsConfig = field(default_factory=ServiceEndpointsConfig); def get(self, key, default=None): parts=key.split('.'); val=self; try: [val := getattr(val, p) for p in parts]; return val; except: return default
        _config_obj = MockConfig()

    _kg_tool_proxy = MockKGTool(); _moe_registry_proxy = MockMoERegistryManager(); _expert_invoker_proxy = MockExpertInvoker(); _fm_client_proxy = MockFMClientTool(); _nn_tool_proxy = MockNNTool(); _xai_proxy = MockXAI(); _workflow_executions_client_proxy = MockVertexClient(); _workflows_client_proxy = MockVertexClient()
    VERTEX_WORKFLOWS_SDK_AVAILABLE = False # Ensure mock status reflected
    # --- End Mock/Placeholder Setup ---

logger = logging.getLogger('MIZ-OKI.BusinessApplications')

# --- Placeholder External API Tools (Need Implementation as Services) ---
# These represent separate microservices deployed (e.g., Cloud Run) that handle
# the specifics of interacting with external platforms like Google Ads, Meta Ads, Shopify, Klaviyo.
# They accept and return MIZ OKI payloads.

class AdsPlatformApiToolProxy: # Renamed to Proxy
    """Proxy for the deployed Ads Platform Interaction Tool."""
    def __init__(self, config: EnhancedConfig, service_endpoint: Optional[str]):
        self.config = config
        self.endpoint = service_endpoint # Fetched from config or passed directly
        self.logger = logging.getLogger('MIZ-OKI.AdsPlatformApiToolProxy')
        self.session: Optional[aiohttp.ClientSession] = None
        if not self.endpoint: self.logger.error("Ads Platform API endpoint not configured!")

    async def initialize(self): self.session = aiohttp.ClientSession()
    async def cleanup(self):
        if self.session: await self.session.close()

    async def _make_request(self, action: str, payload: Dict, trace_id: Optional[str]) -> Dict:
        """Makes an async HTTP request to the deployed Ads Tool."""
        if not self.endpoint or not self.session: return {"status": "error", "error_details": [{"message": "Ads Tool endpoint/session not available"}]}
        url = f"{self.endpoint.rstrip('/')}/{action}" # e.g., /adjust_bid, /update_budget
        miz_oki_request = {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": f"ads_{action}_{uuid.uuid4().hex[:8]}",
            "trace_id": trace_id,
            "source_component": "BusinessAppLayerProxy", # Identify caller
            "payload": payload
        }
        headers = {"Content-Type": "application/json"}
        # --- TODO: Add authentication header ---
        # Example using OIDC token for Cloud Run invoker:
        # token = await self._get_oidc_token(self.endpoint)
        # if token: headers["Authorization"] = f"Bearer {token}"
        # else: return {"status": "error", "error_details": [{"message": "Failed to get auth token for Ads Tool"}]}
        # --- End TODO ---
        try:
            async with self.session.post(url, json=miz_oki_request, headers=headers, timeout=30) as response:
                response.raise_for_status()
                return await response.json() # Assume Ads Tool returns MIZ OKI response
        except aiohttp.ClientResponseError as http_err:
            error_text = await http_err.text()
            self.logger.error(f"HTTP error calling Ads Tool API ({action}) Status {http_err.status}: {error_text}")
            return {"status": "error", "error_details": [{"code": "HTTP_ERROR", "status_code": http_err.status, "message": f"Ads Tool API Error: {error_text}"}]}
        except asyncio.TimeoutError:
             self.logger.error(f"Timeout calling Ads Tool API ({action}) at {url}")
             return {"status": "error", "error_details": [{"code": "TIMEOUT_ERROR", "message": "Timeout calling Ads Tool API"}]}
        except Exception as e:
            self.logger.error(f"Error calling Ads Tool API ({action}): {e}", exc_info=True)
            return {"status": "error", "error_details": [{"code": "INTERNAL_PROXY_ERROR", "message": str(e)}]}

    # --- TODO: Implement _get_oidc_token helper if needed ---
    # async def _get_oidc_token(self, audience): ...
    # --- End TODO ---

    async def adjust_bid(self, request: Dict) -> Dict: # Expects MIZ OKI request
         payload = request.get("payload", {})
         self.logger.info(f"[ADS TOOL PROXY] Calling adjust_bid for {payload.get('campaign_id')}")
         return await self._make_request("adjust_bid", payload, request.get("trace_id"))

    async def update_budget(self, request: Dict) -> Dict: # Expects MIZ OKI request
         payload = request.get("payload", {})
         self.logger.info(f"[ADS TOOL PROXY] Calling update_budget for {payload.get('campaign_id')}")
         return await self._make_request("update_budget", payload, request.get("trace_id"))

    async def create_ad_creative(self, request: Dict) -> Dict: # Expects MIZ OKI request
          payload = request.get("payload", {})
          self.logger.info(f"[ADS TOOL PROXY] Calling create_creative for {payload.get('campaign_id')}")
          # Example: Actual Ads tool might return {"status": "success", "payload": {"creative_id": "new_creative_123"}}
          return await self._make_request("create_creative", payload, request.get("trace_id"))

class CrmApiToolProxy: # Renamed to Proxy
    """Proxy for the deployed CRM Interaction Tool."""
    def __init__(self, config: EnhancedConfig, service_endpoint: Optional[str]):
        self.config = config
        self.endpoint = service_endpoint # Fetched from config or passed directly
        self.logger = logging.getLogger('MIZ-OKI.CrmApiToolProxy')
        self.session: Optional[aiohttp.ClientSession] = None
        if not self.endpoint: self.logger.error("CRM API endpoint not configured!")

    async def initialize(self): self.session = aiohttp.ClientSession()
    async def cleanup(self):
        if self.session: await self.session.close()

    async def _make_request(self, action: str, payload: Dict, trace_id: Optional[str]) -> Dict:
        """Makes an async HTTP request to the deployed CRM Tool."""
        if not self.endpoint or not self.session: return {"status": "error", "error_details": [{"message": "CRM Tool endpoint/session not available"}]}
        url = f"{self.endpoint.rstrip('/')}/{action}"
        miz_oki_request = {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": f"crm_{action}_{uuid.uuid4().hex[:8]}",
            "trace_id": trace_id,
            "source_component": "BusinessAppLayerProxy",
            "payload": payload
        }
        headers = {"Content-Type": "application/json"}
        # TODO: Add authentication header
        try:
            async with self.session.post(url, json=miz_oki_request, headers=headers, timeout=30) as response:
                response.raise_for_status()
                return await response.json()
        except aiohttp.ClientResponseError as http_err:
            error_text = await http_err.text()
            self.logger.error(f"HTTP error calling CRM Tool API ({action}) Status {http_err.status}: {error_text}")
            return {"status": "error", "error_details": [{"code": "HTTP_ERROR", "status_code": http_err.status, "message": f"CRM Tool API Error: {error_text}"}]}
        except asyncio.TimeoutError:
             self.logger.error(f"Timeout calling CRM Tool API ({action}) at {url}")
             return {"status": "error", "error_details": [{"code": "TIMEOUT_ERROR", "message": "Timeout calling CRM Tool API"}]}
        except Exception as e:
            self.logger.error(f"Error calling CRM Tool API ({action}): {e}", exc_info=True)
            return {"status": "error", "error_details": [{"code": "INTERNAL_PROXY_ERROR", "message": str(e)}]}

    async def update_customer_segment(self, request: Dict) -> Dict: # Expects MIZ OKI request
         payload = request.get("payload", {})
         self.logger.info(f"[CRM TOOL PROXY] Calling update_segment for {payload.get('customer_id')}")
         # Example: Actual CRM tool might return {"status": "success", "payload": {"segment_updated": True, "customer_id": "..."}}
         return await self._make_request("update_segment", payload, request.get("trace_id"))

    async def trigger_email_campaign(self, request: Dict) -> Dict: # Expects MIZ OKI request
         payload = request.get("payload", {})
         self.logger.info(f"[CRM TOOL PROXY] Calling trigger_email for {payload.get('email')}")
         # Example: Actual CRM tool might return {"status": "success", "payload": {"campaign_triggered": True, "message_id": "..."}}
         return await self._make_request("trigger_email", payload, request.get("trace_id"))

# --- Data Pseudonymizer (from Cell 3 rework) ---
# Assume DataPseudonymizer class is defined as in Cell 3 rework

# --- Privacy Controls Tool (Reworked - Internal Use, Sync OK) ---
class PrivacyControlsTool:
    """ Implements data privacy policies internally based on configuration. """
    def __init__(self, config: EnhancedConfig):
        if not config: raise InitializationError("Config required for PrivacyControlsTool.")
        self.config = config
        # Load policies from config, default to requiring pseudonymization
        self.policies = config.get("privacy_policies", {"default": {"requires_pseudonymization": True, "allowed_fields": None}})
        self.pseudonymizer = DataPseudonymizer(config.miz_salt) # Assumes salt is loaded
        self.logger = logging.getLogger('MIZ-OKI.PrivacyControlsTool')
        self.logger.info("Privacy Controls Tool logic initialized.")

    def apply_policy(self, data: Union[Dict, List], source_profile_id: Optional[str] = None, target_profile_id: str = "default") -> Union[Dict, List]:
        """
        Apply privacy policy synchronously based on target profile.
        Handles pseudonymization and field filtering.
        """
        target_policy = self.policies.get(target_profile_id, self.policies.get("default", {}))
        requires_pseudo = target_policy.get("requires_pseudonymization", True)
        allowed_fields = target_policy.get("allowed_fields") # Should be a set or None

        if allowed_fields is not None and not isinstance(allowed_fields, set):
            allowed_fields = set(allowed_fields) # Ensure it's a set for efficient lookup

        processed_data = data
        try:
            # 1. Apply Pseudonymization
            if requires_pseudo:
                if isinstance(data, list):
                    processed_data = [self.pseudonymizer.pseudonymize_dict(item) if isinstance(item, dict) else item for item in data]
                elif isinstance(data, dict):
                    processed_data = self.pseudonymizer.pseudonymize_dict(data)
                # else: data is unchanged if not list/dict

            # 2. Apply Field Filtering (if applicable)
            if allowed_fields is not None:
                if isinstance(processed_data, list):
                    # Filter fields within each dictionary in the list
                    processed_data = [{k: v for k, v in item.items() if k in allowed_fields} if isinstance(item, dict) else item for item in processed_data]
                elif isinstance(processed_data, dict):
                    # Filter fields in the dictionary
                    processed_data = {k: v for k, v in processed_data.items() if k in allowed_fields}
                # else: filtering doesn't apply to non-dict/list data

            return processed_data
        except Exception as e:
            self.logger.error(f"Error applying privacy policy: {e}", exc_info=True)
            return data # Return original data on error

# --- Brand Equity-Aware Bidding Tool (Reworked Async) ---
class BrandEquityAwareBiddingTool:
    """ Optimizes bidding async. Deployed as a service callable via MIZ OKI API. """
    def __init__(self, config: EnhancedConfig, kg_tool_proxy: Any, moe_registry_proxy: Any, expert_invoker_proxy: Any, xai_proxy: Optional[Any] = None, ads_platform_tool_proxy: Optional[Any] = None):
        if not all([config, kg_tool_proxy, moe_registry_proxy, expert_invoker_proxy]):
            raise InitializationError("BEABTool requires config and proxies for KG, MoE Registry, and Expert Invoker.")
        self.config = config
        self.kg_tool = kg_tool_proxy
        self.moe_registry = moe_registry_proxy
        self.expert_invoker = expert_invoker_proxy
        self.xai = xai_proxy # Optional XAI proxy
        self.ads_platform_tool = ads_platform_tool_proxy # Optional Ads Platform proxy
        self.default_roas_fallback = 3.0 # Example fallback
        self.equity_weight = config.get("beab_equity_weight", 0.2)
        self.min_bid_threshold = config.get("rtb_min_bid_threshold", 0.01)
        self.logger = logging.getLogger('MIZ-OKI.BEABTool')
        self.logger.info("BEAB Tool logic initialized (Reworked).")

    def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
        """Helper to construct a standard MIZ OKI response."""
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
            "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "BrandEquityAwareBiddingTool", "target_component": request_data.get("source_component"),
            "status": status, "payload": payload, "error_details": errors, "metadata": {}
        }

    async def _get_brand_equity_score(self, context: Dict, trace_id: Optional[str]) -> float:
        """ Fetches brand equity score via KG Tool API proxy. """
        # --- TODO: Define REAL Cypher query for brand equity ---
        query = """
        // Placeholder Query: Replace with actual logic based on KG schema
        // Example: Look for brand sentiment or loyalty scores related to context
        OPTIONAL MATCH (p:Product {id: $product_id})<-[:INTERACTED_WITH]-(u:User)-[:HAS_SENTIMENT]->(s:Sentiment)
        WITH avg(s.score) as avg_sentiment
        OPTIONAL MATCH (cust:Customer {id: $customer_id})
        WITH coalesce(avg_sentiment, 0.5) as sentiment_score, coalesce(cust.loyalty_tier, 1) as loyalty_tier
        RETURN sentiment_score * (1 + (loyalty_tier - 1) * 0.1) AS value // Combine metrics
        """
        params = {"product_id": context.get("product_id"), "customer_id": context.get("customer_id")} # Example parameters
        # --- End TODO ---
        default_equity = 0.7
        try:
            if not self.kg_tool: raise RuntimeError("KG Tool proxy unavailable.")
            kg_request = {"payload": {"query": query, "parameters": params}, "trace_id": trace_id, "request_id": f"kg_get_equity_{trace_id}"}
            kg_response = await self.kg_tool.execute_query(request=kg_request) # Call API proxy
            if kg_response.get("status") == "success" and (results := kg_response.get("payload", {}).get("results")) and results[0] is not None and 'value' in results[0]:
                return float(results[0]['value'])
            logger.warning(f"Failed to get brand equity from KG Tool API, using default {default_equity}. Response: {kg_response}")
            return default_equity
        except Exception as e:
            logger.error(f"KG Tool API call failed for equity: {e}", exc_info=True)
            return default_equity

    async def _get_roas_prediction(self, context: Dict, trace_id: Optional[str]) -> float:
        """ Gets ROAS prediction via MoE Registry/Invoker API proxies. """
        try:
            if not self.moe_registry or not self.expert_invoker: raise RuntimeError("MoE Registry or Expert Invoker proxy unavailable.")
            # Find the appropriate forecasting expert
            expert_id = await self.moe_registry.find_expert_for_task(task_type="forecasting", domain="roas", context=context) # MoE API Call via proxy
            if not expert_id:
                logger.warning(f"No ROAS forecaster expert found via MoE Registry, using fallback {self.default_roas_fallback}.")
                return self.default_roas_fallback

            expert_details = await self.moe_registry.get_expert_details(expert_id) # MoE API Call via proxy
            expert_endpoint = expert_details.get("endpoint") if expert_details else None
            if not expert_endpoint:
                logger.warning(f"Endpoint not found for ROAS expert '{expert_id}', using fallback {self.default_roas_fallback}.")
                return self.default_roas_fallback

            # Prepare input for the expert model
            # --- TODO: Define the actual features needed by the ROAS model ---
            model_input_payload = {"features": context.get("campaign_features", {})} # Example input
            # --- End TODO ---

            # Call Expert Invoker API proxy
            invoker_request = {
                "payload": {"endpoint": expert_endpoint, "data": model_input_payload},
                "trace_id": trace_id, "request_id": f"invoker_roas_{trace_id}"
            }
            invoker_response = await self.expert_invoker.invoke(request=invoker_request) # Call API proxy

            if invoker_response.get("status") == "success":
                result_payload = invoker_response.get("payload", {})
                # --- TODO: Adapt parsing based on actual expert model output format ---
                prediction = result_payload.get("prediction")
                if isinstance(prediction, list) and prediction: return float(prediction[0])
                elif isinstance(prediction, (int, float)): return float(prediction)
                # --- End TODO ---
                else: logger.warning(f"ROAS expert '{expert_id}' returned invalid prediction format: {prediction}, using fallback.")
            else:
                logger.warning(f"Expert Invoker API call failed for ROAS expert '{expert_id}': {invoker_response.get('error_details')}, using fallback.")

            return self.default_roas_fallback
        except Exception as e:
            logger.error(f"Async ROAS prediction via MoE/Invoker API failed: {e}", exc_info=True)
            return self.default_roas_fallback

    async def calculate_adjusted_bid(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Calculates bid async. Expects/Returns MIZ OKI payload. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); base_bid = payload.get("base_bid", 0.05); context = payload.get("context", {})
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        # Basic input validation
        try: base_bid = float(base_bid)
        except (ValueError, TypeError): errors.append({"code": "INVALID_BASE_BID", "message": "'base_bid' must be a number."})
        if not isinstance(context, dict): errors.append({"code": "INVALID_CONTEXT", "message": "'context' must be a dictionary."})
        if errors:
            response = self._create_miz_oki_response(input_data, "bad_request", errors=errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        decision_id = f"beab_{context.get('campaign_id', 'unknown')}_{uuid.uuid4().hex[:8]}"
        status = "pending"; response_payload = None
        log_details = { # Details for logging and potential XAI record
            "decision_id": decision_id, "type": "bid_adjustment", "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "context_preview": {k:v for k,v in context.items() if k in ['campaign_id', 'ad_group_id', 'keyword']}, # Log key context items
            "base_bid": base_bid, "status": "pending"
        }

        try:
            # Fetch predictions concurrently
            predicted_roas, brand_equity = await asyncio.gather(
                self._get_roas_prediction(context, trace_id),
                self._get_brand_equity_score(context, trace_id)
            )

            # Calculate adjustment factors
            roas_target = self.config.business_impact.roas_target
            roas_factor = 1.0 + (predicted_roas - roas_target) / roas_target if roas_target > 0 else 1.0
            equity_factor = 1.0 + (brand_equity - 0.7) * self.equity_weight # Adjust weight based on business strategy

            # Calculate final bid, ensuring it meets minimum threshold
            adjusted_bid = max(self.min_bid_threshold, base_bid * roas_factor * equity_factor)

            log_details.update({
                "predicted_roas": predicted_roas, "brand_equity": brand_equity,
                "roas_target": roas_target, "roas_factor": roas_factor,
                "equity_weight": self.equity_weight, "equity_factor": equity_factor,
                "adjusted_bid": adjusted_bid, "status": "success"
            })
            status = "success"
            response_payload = log_details # Return the calculation details

            # Record decision via XAI Tool API proxy (fire-and-forget)
            if self.xai and hasattr(self.xai, 'record_decision'):
                 xai_record = {
                     "decision_id": decision_id, "component": "BrandEquityAwareBiddingTool",
                     "timestamp": log_details["timestamp"],
                     "context": context, # Full context
                     "inputs": {"base_bid": base_bid},
                     "decision": {"adjusted_bid": adjusted_bid},
                     "outputs": log_details, # Include intermediate calculations
                     "trace_id": trace_id
                 }
                 xai_request = {"payload": {"record": xai_record}, "trace_id": trace_id}
                 asyncio.create_task(self.xai.record_decision(request=xai_request)) # Call API proxy async

        except Exception as e:
            status = "internal_error"; errors.append({"code": "BID_CALC_ERROR", "message": str(e)})
            log_details.update({"status": "failed", "error": str(e), "adjusted_bid": base_bid}) # Log failure, return base bid
            logger.error(f"BEAB Tool: Failed to calculate adjusted bid: {e}", exc_info=True)
            response_payload = log_details # Return log even on failure

        response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        # --- TODO: Persist log_details via KG Tool API or logging service if needed beyond XAI record ---
        return response

    async def execute_bid_adjustment(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
         """ Executes bid adjustment via Ads Platform Tool API proxy. Expects/Returns MIZ OKI. """
         start_time = time.monotonic(); errors = []
         # Parse MIZ OKI input
         payload = input_data.get("payload", {}); bid_details = payload.get("bid_details", {}) # Expects dict like {"platform": "google_ads", "campaign_id": "...", "ad_group_id": "...", "new_bid": 1.23}
         trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

         if not self.ads_platform_tool: errors.append({"code": "MISSING_DEPENDENCY", "message": "Ads Platform Tool proxy unavailable."})
         if not isinstance(bid_details, dict) or not bid_details.get("new_bid") or not bid_details.get("platform"):
             errors.append({"code": "MISSING_DATA", "message": "Missing required bid details (platform, new_bid, identifiers) in payload."})
         if errors:
             response = self._create_miz_oki_response(input_data, "bad_request", errors=errors)
             response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
             return response

         status = "pending"; response_payload = None
         try:
              # Call Ads Platform Tool API proxy
              ads_request = {
                  "payload": bid_details, # Pass details needed by the Ads tool
                  "trace_id": trace_id, "request_id": f"ads_exec_bid_{request_id}"
              }
              ads_response = await self.ads_platform_tool.adjust_bid(request=ads_request) # Call proxy method

              if ads_response.get("status") == "success":
                   status = "success"
                   response_payload = ads_response.get("payload", {}) # Return payload from Ads tool
                   logger.info(f"Successfully executed bid adjustment via Ads Tool API proxy for {bid_details.get('campaign_id')}.")
              else:
                  raise RuntimeError(f"Ads Platform Tool API proxy failed: {ads_response.get('error_details')}")

         except Exception as e:
             status = "internal_error"; errors.append({"code": "ADS_API_ERROR", "message": str(e)})
             logger.error(f"Failed to execute bid adjustment via Ads Tool API proxy: {e}", exc_info=True)

         response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
         response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
         return response

# --- Hyperdimensional Personalization Tool (Reworked Async) ---
class HyperdimensionalPersonalizationTool:
    """ Generates personalization async via deployed Tool APIs proxies. Deployed as a service. """
    def __init__(self, config: EnhancedConfig, kg_tool_proxy: Any, nn_tool_proxy: Any, moe_registry_proxy: Any, expert_invoker_proxy: Any, fm_client_proxy: Optional[Any] = None, crm_tool_proxy: Optional[Any] = None):
        if not all([config, kg_tool_proxy, nn_tool_proxy, moe_registry_proxy, expert_invoker_proxy]):
            raise InitializationError("HPTool requires config and proxies for KG, NN, MoE Registry, and Expert Invoker.")
        self.config = config
        self.kg_tool = kg_tool_proxy
        self.nn_tool = nn_tool_proxy
        self.moe_registry = moe_registry_proxy
        self.expert_invoker = expert_invoker_proxy
        self.fm_client = fm_client_proxy # Optional for content generation
        self.crm_tool = crm_tool_proxy     # Optional for CRM updates
        self.default_rec_engine_alias = "personalization_rec_v2" # Example alias
        self.default_content_gen_alias = config.foundation_models.defaults.llama4_scout
        self.logger = logging.getLogger('MIZ-OKI.HyperPersonalizationTool')
        self.logger.info("HyperPersonalization Tool logic initialized (Reworked).")

    def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
        """Helper to construct a standard MIZ OKI response."""
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
            "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "HyperdimensionalPersonalizationTool", "target_component": request_data.get("source_component"),
            "status": status, "payload": payload, "error_details": errors, "metadata": {}
        }

    async def _get_user_profile_and_embedding(self, user_id: str, trace_id: Optional[str]) -> Tuple[Optional[Dict], Optional[List[float]]]:
        """ Gets profile via KG Tool API proxy and embedding via NN/FM Client Tool API proxy. """
        if not self.kg_tool or not self.nn_tool:
            logger.error("HP: Missing KG or NN Tool proxy for profile/embedding retrieval.")
            return None, None
        user_data = None; embedding = None
        try:
            # Call KG Tool API proxy to get user entity data
            kg_request = {"payload": {"mizId": user_id}, "trace_id": trace_id, "request_id": f"kg_get_user_{user_id}"}
            kg_response = await self.kg_tool.get_entity_endpoint(request=kg_request) # Call API proxy method
            user_data = kg_response.get("payload", {}).get("entity_data") if kg_response.get("status") == "success" else None

            if not user_data:
                logger.warning(f"HP: User profile not found in KG for user_id: {user_id}")
                return None, None

            # --- Construct text representation from user_data for embedding ---
            profile_parts = [f"Segment: {user_data.get('segment', 'Unknown')}", f"Interests: {user_data.get('interests', [])}"]
            # Add recent activity summary if available
            # profile_parts.append(f"Recent: {user_data.get('recent_activity_summary', '')}")
            profile_text = ". ".join(p for p in profile_parts if p)
            if not profile_text: profile_text = f"User {user_id}" # Fallback
            # --- End Construct ---

            # Call NN/FM Client Tool API proxy for embedding
            nn_request = {
                "payload": {"data": profile_text, "data_type": "user_profile"}, # Pass text and type hint
                "trace_id": trace_id, "request_id": f"nn_embed_user_{user_id}"
            }
            nn_response = await self.nn_tool.get_embedding(input_data=nn_request) # Call API proxy method
            embedding = nn_response.get("payload", {}).get("embedding") if nn_response.get("status") == "success" else None

            if embedding is None:
                logger.warning(f"HP: Failed to get embedding for user {user_id} via NN/FM Tool API.")

            return user_data, embedding
        except Exception as e:
            logger.error(f"HP: Error getting profile/embedding for {user_id} via API proxies: {e}", exc_info=True)
            return user_data, embedding # Return potentially partial results

    async def get_personalized_recommendations(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Generates recommendations via MoE Invoker API proxy. Expects/Returns MIZ OKI. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); user_id = payload.get("user_id"); k = payload.get("k", 5); context = payload.get("context", {})
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        if not user_id: errors.append({"code": "MISSING_USER_ID", "message": "'user_id' is required."})
        if not self.moe_registry or not self.expert_invoker: errors.append({"code": "MISSING_DEPENDENCY", "message": "MoE Registry or Expert Invoker proxy unavailable."})
        if errors:
            response = self._create_miz_oki_response(input_data, "bad_request", errors=errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        recommendations = []; status = "pending"; response_payload = None
        try:
            profile, embedding = await self._get_user_profile_and_embedding(user_id, trace_id)
            if embedding is None: raise ValueError(f"No profile/embedding found for user {user_id}.")

            # Find recommendation expert via MoE Registry API proxy
            expert_id = await self.moe_registry.find_expert_for_task(task_type="recommendation", domain="e-commerce", context={"user_id": user_id}) # Example domain
            if not expert_id: raise RuntimeError(f"Recommendation expert not found via MoE Registry.")

            expert_details = await self.moe_registry.get_expert_details(expert_id)
            expert_endpoint = expert_details.get("endpoint") if expert_details else None
            if not expert_endpoint: raise RuntimeError(f"Endpoint not found for recommendation expert '{expert_id}'.")

            # --- TODO: Fetch item catalog context if needed by the expert model ---
            # item_catalog = await self._fetch_item_catalog(context)
            # --- End TODO ---

            # Prepare input for the recommendation expert model
            expert_input_payload = {
                "user_id": user_id,
                "user_embedding": embedding,
                "num_recommendations": k,
                "context": context, # Pass additional context (e.g., current page, time)
                # "item_catalog": item_catalog # Pass item data if needed
            }

            # Call Expert Invoker API proxy
            invoker_request = {
                "payload": {"endpoint": expert_endpoint, "data": expert_input_payload},
                "trace_id": trace_id, "request_id": f"invoker_rec_{request_id}"
            }
            invoker_response = await self.expert_invoker.invoke(request=invoker_request) # Call API proxy

            if invoker_response.get("status") == "success":
                result_payload = invoker_response.get("payload", {})
                # --- TODO: Adapt parsing based on actual expert model output format ---
                recommendations = result_payload.get("recommendations", []) # Expecting a list of item IDs or objects
                # --- End TODO ---
                status = "success"; response_payload = {"recommendations": recommendations[:k]}
                logger.info(f"HP Recs: Generated {len(recommendations)} recs for {user_id} via MoE API proxy '{expert_id}'.")
            else:
                raise RuntimeError(f"Expert Invoker API call failed for rec expert '{expert_id}': {invoker_response.get('error_details')}")

        except (ValueError, RuntimeError) as vr_e:
             status = "failed"; errors.append({"code": "REC_FAILED", "message": str(vr_e)}); logger.warning(f"HP Recs: Recommendation generation failed for {user_id}: {vr_e}")
        except Exception as e:
             status = "internal_error"; errors.append({"code": "INTERNAL_ERROR", "message": str(e)}); logger.error(f"HP Recs: Recommendation generation failed unexpectedly for {user_id}: {e}", exc_info=True)

        response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

    async def generate_personalized_content(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Generates content via FM Client API proxy. Expects/Returns MIZ OKI. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); user_id = payload.get("user_id"); content_type = payload.get("content_type", "email_subject"); base_content = payload.get("base_content"); context = payload.get("context", {})
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        if not user_id or not base_content: errors.append({"code": "MISSING_DATA", "message": "'user_id' and 'base_content' are required."})
        if not self.fm_client: errors.append({"code": "MISSING_DEPENDENCY", "message": "FM Client Tool proxy unavailable."})
        if errors:
            response = self._create_miz_oki_response(input_data, "bad_request", errors=errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        personalized_content = base_content # Default
        status = "pending"; response_payload = {"personalized_content": base_content}; response_metadata = {} # Default payload

        try:
            profile, _ = await self._get_user_profile_and_embedding(user_id, trace_id)
            if not profile: raise ValueError(f"No profile found for user {user_id}.")

            # Construct a concise profile summary for the prompt
            profile_summary = json.dumps({
                k: profile.get(k) for k in ['name', 'segment', 'interests', 'recent_purchase_category'] if profile.get(k) # Example fields
            }, default=str, separators=(',', ':'))

            # Construct prompt for personalization
            prompt = f"""Personalize the following content for the user based on their profile and the context.
User Profile: {profile_summary}
Context: {json.dumps(context, default=str)}
Content Type: {content_type}
Base Content: "{base_content}"
Personalized Content:"""

            model_alias = self.default_content_gen_alias
            logger.debug(f"HP Content Gen Prompt (first 200): {prompt[:200]}...")

            # Call FM Client API proxy
            fm_request = {
                "payload": {"prompt": prompt, "model_alias": model_alias, "temperature": 0.7, "max_tokens": 256}, # Adjust params as needed
                "trace_id": trace_id, "request_id": f"fm_hp_content_{request_id}"
            }
            fm_response = await self.fm_client.generate_text(input_data=fm_request) # Call API proxy

            if fm_response.get("status") == "success":
                generated_text = fm_response.get("payload",{}).get("generated_text")
                if generated_text:
                    personalized_content = generated_text.strip().strip('"') # Clean up output
                    status = "success"
                else:
                    logger.warning(f"HP Content Gen: FM Client API proxy returned empty content for {user_id}.")
                    status = "success_no_change" # Indicate no change was made
                response_metadata = fm_response.get("metadata", {}) # Get metadata from FM call
            else:
                raise RuntimeError(f"FM Client API proxy failed: {fm_response.get('error_details')}")

            response_payload["personalized_content"] = personalized_content

        except (ValueError, RuntimeError) as vr_e:
             status = "failed"; errors.append({"code": "CONTENT_GEN_FAILED", "message": str(vr_e)}); logger.warning(f"HP Content Gen: Failed for {user_id}: {vr_e}")
        except Exception as e:
             status = "internal_error"; errors.append({"code": "INTERNAL_ERROR", "message": str(e)}); logger.error(f"HP Content Gen: Failed unexpectedly for {user_id}: {e}", exc_info=True)

        response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
        # Add processing time, potentially merging with FM metadata if it exists
        proc_duration = (time.monotonic() - start_time) * 1000
        response["metadata"] = response_metadata # Start with FM metadata
        response["metadata"]["hp_tool_processing_duration_ms"] = proc_duration
        return response

    async def update_crm_segment(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
         """ Updates CRM segment via CRM Tool API proxy. Expects/Returns MIZ OKI. """
         start_time = time.monotonic(); errors = []
         # Parse MIZ OKI input
         payload = input_data.get("payload", {}); user_id = payload.get("user_id"); segment = payload.get("predicted_segment")
         trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

         if not user_id or not segment: errors.append({"code": "MISSING_DATA", "message": "'user_id' and 'predicted_segment' are required."})
         if not self.crm_tool: errors.append({"code": "MISSING_DEPENDENCY", "message": "CRM Tool proxy unavailable."})
         if errors:
             response = self._create_miz_oki_response(input_data, "bad_request", errors=errors)
             response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
             return response

         status = "pending"; response_payload = None
         try:
              # Call CRM Tool API proxy
              crm_request = {
                  "payload": {"customer_id": user_id, "segment": segment}, # Adapt payload keys as needed by CRM tool
                  "trace_id": trace_id, "request_id": f"crm_update_seg_{request_id}"
              }
              crm_response = await self.crm_tool.update_customer_segment(request=crm_request) # Call proxy method

              if crm_response.get("status") == "success":
                   status = "success"; response_payload = crm_response.get("payload", {})
                   logger.info(f"Successfully updated CRM segment for {user_id} via API proxy.")
              else:
                   raise RuntimeError(f"CRM Tool API proxy failed: {crm_response.get('error_details')}")

         except Exception as e:
             status = "internal_error"; errors.append({"code": "CRM_API_ERROR", "message": str(e)})
             logger.error(f"Failed to update CRM segment for {user_id} via API proxy: {e}", exc_info=True)

         response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
         response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
         return response

# --- Adaptive Workflow Evolution Service (Reworked - Uses REAL Vertex Client Proxies) ---
class AdaptiveWorkflowEvolutionService:
    """ Defines, initiates, monitors, and adapts Vertex AI Workflows using REAL client library proxies. Deployed as a service. """
    def __init__(self, config: EnhancedConfig, kg_tool_proxy: Any, fm_client_proxy: Any,
                 workflows_client_proxy: Optional[Any] = None, # Inject REAL client proxies
                 executions_client_proxy: Optional[Any] = None):
        if not all([config, kg_tool_proxy, fm_client_proxy]):
             raise InitializationError("AWEService requires config and proxies for KG and FM tools.")
        self.config = config
        self.kg_tool = kg_tool_proxy
        self.fm_client = fm_client_proxy
        self.project = config.gcp.project_id
        self.location = config.gcp.region
        self.wf_client = workflows_client_proxy # Use injected client proxy
        self.exec_client = executions_client_proxy # Use injected client proxy
        # TODO: Use persistent storage (e.g., GCS, DB) for definitions cache in production
        self.workflow_definitions_cache = TTLCache(maxsize=100, ttl=3600) # 1 hour TTL cache
        self.logger = logging.getLogger('MIZ-OKI.AWEService')
        if not self.wf_client or not self.exec_client:
            self.logger.critical("AWE Service cannot function: Vertex AI Workflow/Execution client proxies missing.")
        elif not VERTEX_WORKFLOWS_SDK_AVAILABLE:
             self.logger.warning("AWE Service initialized, but Vertex Workflows SDK seems unavailable. Functionality limited.")
        self.logger.info("Adaptive Workflow Evolution Service logic initialized (Reworked).")

    def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
        """Helper to construct a standard MIZ OKI response."""
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
            "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "AdaptiveWorkflowEvolutionService", "target_component": request_data.get("source_component"),
            "status": status, "payload": payload, "error_details": errors, "metadata": {}
        }

    async def _get_workflow_parent(self) -> str:
        """Constructs the parent path for workflow resources."""
        if not self.project or not self.location: raise ValueError("GCP Project ID/Location missing.")
        return f"projects/{self.project}/locations/{self.location}"

    async def define_workflow(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Defines/Updates Vertex AI Workflow via client proxy. Expects/Returns MIZ OKI. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); workflow_id = payload.get("workflow_id"); definition = payload.get("definition_source_code"); description = payload.get("description"); labels = payload.get("labels")
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        if not self.wf_client or not VERTEX_WORKFLOWS_SDK_AVAILABLE: errors.append({"code": "MISSING_DEPENDENCY", "message": "Vertex Workflows client proxy/SDK unavailable."})
        if not workflow_id or not definition: errors.append({"code": "MISSING_DATA", "message": "'workflow_id' and 'definition_source_code' required."})
        if errors:
            response = self._create_miz_oki_response(input_data, "bad_request", errors=errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        self.logger.info(f"AWE: Defining/Updating Vertex AI Workflow: {workflow_id}")
        status = "pending"; response_payload = None
        try:
            parent = await self._get_workflow_parent()
            workflow_name = f"{parent}/workflows/{workflow_id}"
            # Construct the Workflow object using the SDK's type
            workflow_obj_dict = {
                "name": workflow_name,
                "description": description or f"MIZ OKI Managed Workflow: {workflow_id}",
                "source_contents": definition,
                "labels": labels or {"miz_oki_managed": "true", "miz_oki_version": self.config.miz_oki_schema_version}
            }
            workflow_proto = workflows_v1.Workflow()
            json_format.ParseDict(workflow_obj_dict, workflow_proto._pb) # Use _pb for proto access

            action = "unknown"
            try:
                 # Try update first - Check existence using get_workflow
                 get_request = workflows_v1.GetWorkflowRequest(name=workflow_name)
                 await self.wf_client.get_workflow(request=get_request) # Call proxy method
                 # If get succeeds, update
                 mask = field_mask_pb2.FieldMask(paths=["description", "source_contents", "labels"])
                 update_request = workflows_v1.UpdateWorkflowRequest(workflow=workflow_proto, update_mask=mask)
                 operation = await self.wf_client.update_workflow(request=update_request) # Call proxy method
                 # Wait for Long-Running Operation (LRO) completion
                 # The result() method might block, consider using polling or callbacks for truly async
                 await asyncio.to_thread(operation.result, timeout=180)
                 action = "updated"
                 self.logger.info(f"AWE: Workflow '{workflow_id}' updated successfully.")
            except google_api_exceptions.NotFound:
                 # Create if not found
                 create_request = workflows_v1.CreateWorkflowRequest(parent=parent, workflow=workflow_proto, workflow_id=workflow_id)
                 operation = await self.wf_client.create_workflow(request=create_request) # Call proxy method
                 await asyncio.to_thread(operation.result, timeout=180) # Wait for LRO completion
                 action = "created"
                 self.logger.info(f"AWE: Workflow '{workflow_id}' created successfully.")

            self.workflow_definitions_cache[workflow_id] = definition # Update cache
            status = "success"
            response_payload = {"workflow_id": workflow_id, "action": action, "workflow_name": workflow_name}

        except google_api_exceptions.GoogleAPIError as api_e:
            status = "api_error"; errors.append({"code": "VERTEX_API_ERROR", "message": str(api_e)}); logger.error(f"AWE API Error defining/updating Vertex Workflow '{workflow_id}': {api_e}", exc_info=True)
        except Exception as e:
            status = "internal_error"; errors.append({"code": "AWE_DEFINE_ERROR", "message": str(e)}); logger.error(f"AWE Error defining/updating Vertex Workflow '{workflow_id}': {e}", exc_info=True)

        response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

    async def execute_workflow(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Initiates Vertex AI workflow execution via client proxy. Expects/Returns MIZ OKI. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); workflow_id = payload.get("workflow_id"); initial_context = payload.get("initial_context", {})
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        if not self.exec_client or not VERTEX_WORKFLOWS_SDK_AVAILABLE: errors.append({"code": "MISSING_DEPENDENCY", "message": "Vertex Executions client proxy/SDK unavailable."})
        if not workflow_id: errors.append({"code": "MISSING_DATA", "message": "'workflow_id' is required."})
        if errors:
            response = self._create_miz_oki_response(input_data, "bad_request", errors=errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        status = "pending"; response_payload = None
        run_id = f"wf_exec_{workflow_id}_{uuid.uuid4().hex[:8]}" # Internal run ID for logging
        self.logger.info(f"AWE: Starting Vertex AI workflow '{workflow_id}' (Run ID: {run_id}).")
        try:
            parent = await self._get_workflow_parent()
            workflow_name = f"{parent}/workflows/{workflow_id}"

            # The initial_context IS the MIZ OKI payload for the *first step* of the target workflow
            first_step_miz_oki_input = {
                "miz_oki_version": self.config.miz_oki_schema_version,
                "request_id": f"req_wf_start_{run_id}", "trace_id": trace_id,
                "workflow_execution_id": None, "step_id": "start", # Execution ID filled by Vertex
                "source_component": "AdaptiveWorkflowEvolutionService",
                "target_component": f"Workflow:{workflow_id}:Step1", # Conceptual target
                "payload": initial_context or {}
            }
            execution_args = json.dumps(first_step_miz_oki_input) # Argument must be JSON string

            execution_proto = Execution(argument=execution_args) # Use real proto type
            request = CreateExecutionRequest(parent=workflow_name, execution=execution_proto) # Use real request type

            # Call the REAL client proxy method
            exec_response = await self.exec_client.create_execution(request=request)
            execution_name = exec_response.name # Full execution name: projects/.../executions/...

            status = "success"
            response_payload = {"execution_name": execution_name, "run_id": run_id}
            self.logger.info(f"AWE Run {run_id}: Vertex AI execution created: {execution_name}")

        except google_api_exceptions.NotFound:
            status = "not_found"; errors.append({"code": "WORKFLOW_NOT_FOUND", "message": f"Workflow '{workflow_id}' not found."}); logger.error(f"Workflow '{workflow_id}' not found.")
        except google_api_exceptions.GoogleAPIError as api_e:
            status = "api_error"; errors.append({"code": "VERTEX_API_ERROR", "message": str(api_e)}); logger.error(f"AWE API Error initiating workflow run for '{workflow_id}': {api_e}", exc_info=True)
        except Exception as e:
            status = "internal_error"; errors.append({"code": "AWE_EXECUTE_ERROR", "message": str(e)}); logger.error(f"AWE Error initiating workflow run for '{workflow_id}': {e}", exc_info=True)

        response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

    async def monitor_and_adapt(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Monitors workflow performance and triggers evolution. Expects/Returns MIZ OKI. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); execution_id_to_monitor = payload.get("execution_id"); workflow_id_to_monitor = payload.get("workflow_id") # Can monitor specific exec or workflow
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        self.logger.info(f"AWE: Running monitor & adapt cycle (Exec:{execution_id_to_monitor}, WF:{workflow_id_to_monitor})...")
        status = "pending"; response_payload = None; evolution_triggered = False

        # --- TODO: Implement Real Monitoring Logic ---
        # 1. Fetch execution history/metrics:
        #    - Use `self.exec_client.list_executions` with filters (status=FAILED, duration > threshold).
        #    - Query Cloud Logging for errors associated with workflow/execution IDs.
        #    - Query KG Tool API proxy for business impact linked to workflow outcomes.
        # 2. Analyze metrics against thresholds/baselines.
        # 3. If issue detected:
        #    a. Determine problematic workflow_id if only execution_id was given.
        #    b. Prepare context for evolution (reason, metrics).
        #    c. Trigger evolution asynchronously:
        #       asyncio.create_task(self.evolve_workflow(problem_wf_id, evolution_context, trace_id))
        #       evolution_triggered = True
        # --- Simulation ---
        if random.random() < 0.05: # Simulate issue detection
            problem_wf_id = workflow_id_to_monitor or (execution_id_to_monitor.split('/workflows/')[1].split('/executions/')[0] if execution_id_to_monitor and '/workflows/' in execution_id_to_monitor else None)
            if problem_wf_id:
                 self.logger.warning(f"AWE: Simulated issue with workflow '{problem_wf_id}'. Triggering evolution.")
                 evolution_context = {"reason": "Simulated performance degradation", "trigger_trace_id": trace_id}
                 # Trigger evolution asynchronously (fire-and-forget for this monitor cycle)
                 asyncio.create_task(self.evolve_workflow(problem_wf_id, evolution_context, trace_id))
                 evolution_triggered = True
            else: logger.warning("AWE: Simulated issue detected but could not determine workflow ID.")
        # --- End Simulation ---

        status = "success" # Monitoring cycle completed
        response_payload = {"monitoring_complete": True, "evolution_triggered": evolution_triggered}

        response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

    async def evolve_workflow(self, workflow_id: str, context: Optional[Dict] = None, trace_id: Optional[str] = None):
        """ Evolves workflow definition via FM Client API proxy and re-deploys via Vertex API proxy. """
        if not self.wf_client or not self.fm_client or not VERTEX_WORKFLOWS_SDK_AVAILABLE:
            logger.error(f"AWE: Cannot evolve workflow '{workflow_id}'. Client proxies/SDK unavailable."); return
        self.logger.info(f"AWE: Starting evolution process for workflow '{workflow_id}'...")
        context = context or {}
        evolution_id = f"evolve_{workflow_id}_{uuid.uuid4().hex[:6]}"

        try:
            # 1. Get current definition via Vertex API proxy
            parent = await self._get_workflow_parent()
            workflow_name = f"{parent}/workflows/{workflow_id}"
            get_request = workflows_v1.GetWorkflowRequest(name=workflow_name)
            current_wf = await self.wf_client.get_workflow(request=get_request) # Call proxy method
            current_source = current_wf.source_contents
            self.workflow_definitions_cache[workflow_id] = current_source # Update cache

            # 2. Analyze & Suggest Modifications via FM Client API proxy
            analysis_prompt = f"""Analyze the following Vertex AI Workflow YAML definition for potential improvements based on the provided context.
Workflow ID: {workflow_id}
Reason for Evolution: {context.get('reason', 'General optimization request')}
Current Workflow Source:
```yaml
{current_source}


 model_alias = self.config.foundation_models.defaults.llama4_maverick # Use powerful model for code generation
        fm_request = {
            "payload": {"prompt": analysis_prompt, "model_alias": model_alias, "max_tokens": 4096, "temperature": 0.2}, # Low temp for code
            "trace_id": trace_id, "request_id": f"fm_evolve_{evolution_id}"
        }
        fm_response = await self.fm_client.generate_text(input_data=fm_request) # Call API proxy

        if fm_response.get("status") != "success":
            raise RuntimeError(f"AWE: LLM suggestion generation failed via FM Client API: {fm_response.get('error_details')}")

        generated_text = fm_response.get("payload", {}).get("generated_text", "")

        # 3. Extract and Validate YAML
        match = re.search(r"```(?:yaml)?\s*(.*?)\s*```", generated_text, re.DOTALL)
        new_source = match.group(1).strip() if match else generated_text.strip() # Extract content within ```yaml ... ```

        if not new_source or new_source == current_source.strip():
            logger.info(f"AWE: LLM suggested no changes for workflow '{workflow_id}'. Evolution cycle complete.")
            # --- TODO: Log no-change event ---
            return # No changes needed

        # --- TODO: Add YAML validation step here ---
        try:
            import yaml
            yaml.safe_load(new_source)
            logger.debug("Evolved YAML syntax is valid.")
        except ImportError:
             logger.warning("PyYAML not installed. Skipping YAML validation for evolved workflow.")
        except Exception as yaml_e:
             raise ValueError(f"Generated YAML is invalid: {yaml_e}")
        # --- End TODO ---

        # 4. Deploy Update via Vertex API proxy (Calls internal define_workflow method of this service)
        self.logger.info(f"AWE: Deploying LLM-suggested update for workflow '{workflow_id}'.")
        define_request = {
            "payload": {
                "workflow_id": workflow_id,
                "definition_source_code": new_source,
                "description": f"Evolved by AWE: {context.get('reason', 'Auto-optimization')} ({datetime.now(datetime.timezone.utc).isoformat()})",
                "labels": {"miz_oki_managed": "true", "miz_oki_version": self.config.miz_oki_schema_version, "awe_evolved": "true"}
            },
            "trace_id": trace_id, "request_id": f"awe_define_{evolution_id}"
        }
        define_response = await self.define_workflow(input_data=define_request) # Call internal method

        if define_response.get("status") != "success":
            raise RuntimeError(f"Failed to deploy evolved workflow '{workflow_id}': {define_response.get('error_details')}")

        self.logger.info(f"AWE: Successfully evolved and deployed workflow '{workflow_id}'.")
        # --- TODO: Log evolution event to KG or monitoring system ---
        # Example: kg_request = {"payload": {"entity_data": {"type": "WorkflowEvolutionEvent", ...}}, "trace_id": trace_id}
        # await self.kg_tool.add_entity_endpoint(request=kg_request)
        # --- End TODO ---

    except google_api_exceptions.NotFound:
        logger.error(f"AWE Evolve Error: Workflow '{workflow_id}' not found.")
    except ValueError as ve: # Catch validation errors
         logger.error(f"AWE Evolve Error for '{workflow_id}': {ve}")
    except Exception as e:
        logger.error(f"AWE: Error evolving workflow '{workflow_id}': {e}", exc_info=True)

SyntaxError: unterminated triple-quoted string literal (detected at line 1025) (<ipython-input-11-a960b2712cbc>, line 958)

In [12]:
#cell7.1
--- Other Business App Tool Stubs (Reworked - Conceptual API Calls via Proxies) ---
These tools would be deployed as separate services and called via MIZ OKI API.
class RealTimeBiddingTool: """Handles real-time bidding decisions. Deployed as a service.""" def init(self, beab_tool_proxy: Any, ads_platform_tool_proxy: Any, config: EnhancedConfig): if not config or not beab_tool_proxy: raise InitializationError("RTBTool requires config and BEAB Tool proxy.") self.beab_tool = beab_tool_proxy self.ads_platform_tool = ads_platform_tool_proxy # Optional for immediate execution self.config = config self.logger = logging.getLogger('MIZ-OKI.RTBTool') self.logger.info("RTB Tool logic initialized.")

def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
    """Helper to construct a standard MIZ OKI response."""
    return {
        "miz_oki_version": self.config.miz_oki_schema_version,
        "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
        "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
        "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
        "source_component": "RealTimeBiddingTool", "target_component": request_data.get("source_component"),
        "status": status, "payload": payload, "error_details": errors, "metadata": {}
    }

async def process_bid_request(self, input_data: Dict) -> Dict: # Expects MIZ OKI bid request payload
    """Processes a bid request, calculates bid, optionally executes."""
    start_time = time.monotonic(); errors = []
    payload = input_data.get("payload", {}); context = payload.get("context", {}); base_bid = payload.get("bid_floor", 0.05)
    trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")
    status = "pending"; response_payload = None

    if not self.beab_tool: errors.append({"code": "MISSING_DEPENDENCY", "message": "BEAB Tool proxy unavailable."})
    if errors:
        response = self._create_miz_oki_response(input_data, "config_error", errors=errors)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

    try:
        # 1. Calculate Adjusted Bid via BEAB Tool API Proxy
        beab_request = {
            "payload": {"base_bid": base_bid, "context": context},
            "trace_id": trace_id, "request_id": f"beab_calc_{request_id}"
        }
        beab_response = await self.beab_tool.calculate_adjusted_bid(input_data=beab_request) # Call proxy
        calc_log = beab_response.get("payload", {}) # Get the log details from BEAB response
        response_payload = {"calculation_log": calc_log} # Include BEAB log

        if beab_response.get("status") == "success":
            adjusted_bid = calc_log.get("adjusted_bid", 0)
            # Decide whether to bid (e.g., if adjusted bid > floor)
            should_bid = adjusted_bid > base_bid # Simple logic example
            if should_bid:
                status = "success_bid_calculated"
                response_payload["bid_value"] = adjusted_bid
                logger.info(f"RTB: Calculated bid {adjusted_bid:.4f} for request {request_id}.")
                # --- Optional: Execute bid immediately via Ads Platform Tool API Proxy ---
                # if self.ads_platform_tool:
                #     exec_payload = {**context, "new_bid": adjusted_bid, "platform": context.get("platform")} # Adapt payload
                #     exec_request = {"payload": exec_payload, "trace_id": trace_id, "request_id": f"ads_exec_rtb_{request_id}"}
                #     exec_response = await self.ads_platform_tool.adjust_bid(request=exec_request)
                #     response_payload["execution_status"] = exec_response.get("status")
                #     response_payload["execution_details"] = exec_response.get("payload") or exec_response.get("error_details")
                #     if exec_response.get("status") != "success": logger.warning("RTB: Immediate bid execution failed.")
                # --- End Optional Execution ---
            else:
                status = "success_no_bid"
                response_payload["bid_value"] = 0.0
                logger.info(f"RTB: No bid placed for request {request_id}. Adjusted bid {adjusted_bid:.4f} <= base {base_bid:.4f}.")
        else:
            # Propagate error from BEAB tool
            status = "failed"
            errors.append({"code": "BEAB_TOOL_ERROR", "message": "BEAB tool failed to calculate bid.", "details": beab_response.get("error_details")})

    except Exception as e:
        status = "internal_error"; errors.append({"code": "RTB_ERROR", "message": str(e)})
        logger.error(f"RTB Tool Error processing bid request {request_id}: {e}", exc_info=True)

    response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
    response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
    return response
class AdOptimizationTool: """Optimizes ad campaigns (targeting, budget, creative). Deployed as a service.""" def init(self, config: EnhancedConfig, moe_registry_proxy: Any, expert_invoker_proxy: Any, fm_client_proxy: Any, ads_platform_tool_proxy: Any, kg_tool_proxy: Any): if not all([config, moe_registry_proxy, expert_invoker_proxy, fm_client_proxy, ads_platform_tool_proxy, kg_tool_proxy]): raise InitializationError("AdOptTool requires config and proxies for MoE, Invoker, FM, Ads Platform, and KG.") self.config = config; self.moe_registry = moe_registry_proxy; self.expert_invoker = expert_invoker_proxy; self.fm_client = fm_client_proxy; self.ads_platform_tool = ads_platform_tool_proxy; self.kg_tool = kg_tool_proxy self.logger = logging.getLogger('MIZ-OKI.AdOptTool') self.logger.info("Ad Optimization Tool logic initialized.")

def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
    """Helper to construct a standard MIZ OKI response."""
    return {
        "miz_oki_version": self.config.miz_oki_schema_version,
        "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
        "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
        "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
        "source_component": "AdOptimizationTool", "target_component": request_data.get("source_component"),
        "status": status, "payload": payload, "error_details": errors, "metadata": {}
    }

async def optimize_campaign(self, input_data: Dict) -> Dict: # Expects MIZ OKI payload
    """Optimizes a specific campaign based on performance data and goals."""
    start_time = time.monotonic(); errors = []
    payload = input_data.get("payload", {}); campaign_id = payload.get("campaign_id"); platform = payload.get("platform")
    trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

    if not campaign_id or not platform: errors.append({"code": "MISSING_DATA", "message": "'campaign_id' and 'platform' are required."})
    # Add dependency checks here
    if errors:
        response = self._create_miz_oki_response(input_data, "bad_request", errors=errors)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

    status = "pending"; response_payload = None; actions_taken = []
    try:
        # --- TODO: Implement full optimization logic ---
        self.logger.info(f"AdOpt Tool: Starting optimization for {platform}:{campaign_id} (Placeholder Logic)")
        # 1. Fetch Data: Call KG Tool API proxy to get campaign details, performance metrics, goals.
        #    kg_request = {"payload": {"mizId": campaign_id}, "trace_id": trace_id}
        #    kg_response = await self.kg_tool.get_entity_endpoint(request=kg_request)
        #    campaign_data = kg_response.get("payload", {}).get("entity_data") if kg_response.get("status") == "success" else {}

        # 2. Analyze Performance: Compare metrics against goals. Maybe use an MoE expert via Invoker API proxy.
        #    expert_id = await self.moe_registry.find_expert_for_task(task_type="performance_analysis", domain=platform)
        #    invoker_request = {"payload": {"endpoint": ..., "data": {"campaign_data": campaign_data}}, "trace_id": trace_id}
        #    analysis_response = await self.expert_invoker.invoke(request=invoker_request)
        #    analysis = analysis_response.get("payload", {}) if analysis_response.get("status") == "success" else {}

        # 3. Decide Actions: Based on analysis, decide on budget, targeting, or creative changes.
        budget_change = None; targeting_change = None; creative_needed = False
        #    if analysis.get("performance_issue") == "under_budget": budget_change = campaign_data.get("budget", 100) * 1.1
        #    if analysis.get("performance_issue") == "poor_targeting": targeting_change = {"new_audience": "segment_y"}
        #    if analysis.get("creative_fatigue", False): creative_needed = True

        # 4. Generate Creatives (if needed): Call FM Client API proxy.
        if creative_needed:
             fm_request = {"payload": {"prompt": f"Generate 3 new ad text variations for campaign {campaign_id} about [product/service]", "model_alias": "llama4_scout"}, "trace_id": trace_id}
             fm_response = await self.fm_client.generate_text(input_data=fm_request)
             if fm_response.get("status") == "success":
                 new_creatives = fm_response.get("payload",{}).get("generated_text")
                 # TODO: Call Ads Platform API proxy to upload/create new creatives
                 # ads_creative_request = {"payload": {"platform": platform, "campaign_id": campaign_id, "creatives": new_creatives}, "trace_id": trace_id}
                 # creative_response = await self.ads_platform_tool.create_ad_creative(request=ads_creative_request)
                 # if creative_response.get("status") == "success": actions_taken.append({"type": "create_creative", "details": creative_response.get("payload")})
                 actions_taken.append({"type": "generate_creative", "status": "generated", "text_preview": str(new_creatives)[:50]}) # Placeholder action log

        # 5. Execute Budget/Targeting Changes: Call Ads Platform API proxy.
        if budget_change is not None:
             ads_budget_request = {"payload": {"platform": platform, "campaign_id": campaign_id, "new_budget": budget_change}, "trace_id": trace_id}
             ads_response = await self.ads_platform_tool.update_budget(request=ads_budget_request) # Call proxy
             if ads_response.get("status") == "success": actions_taken.append({"type": "update_budget", "new_budget": budget_change, "status": "success"})
             else: actions_taken.append({"type": "update_budget", "new_budget": budget_change, "status": "failed", "error": ads_response.get("error_details")})
        # Add similar logic for targeting changes...

        # --- End Placeholder Logic ---

        logger.info(f"AdOpt Tool: Completed optimization cycle for {campaign_id}. Actions: {len(actions_taken)}")
        status = "success" # Assume success if no critical error occurred during placeholder logic
        response_payload = {"campaign_id": campaign_id, "actions_taken": actions_taken}

    except Exception as e:
        status = "internal_error"; errors.append({"code": "ADOPT_ERROR", "message": str(e)})
        logger.error(f"AdOpt Tool Error for {campaign_id}: {e}", exc_info=True)

    response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
    response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
    return response
--- Other Business App Tool Stubs (Need full implementation as services/tools) ---
class BusinessInsightsTool: """Generates business insights and alerts. Deployed as a service.""" # Needs KG, FM proxies async def generate_predictive_alert(self, input_data: Dict) -> Dict: logger.warning("BI Tool generate_predictive_alert not implemented."); return {"status": "not_implemented"}

class StrategyToExecutionBridgeTool: """Translates strategy to executable actions/workflows. Deployed as a service.""" # Needs KG, FM, AWE proxies/clients async def translate_strategy(self, input_data: Dict) -> Dict: logger.warning("StrategyBridge Tool translate_strategy not implemented."); return {"status": "not_implemented"}

--- Initialization (Conceptual - Tools instantiated by framework/orchestrator) ---
This would happen in the main application entry point or service factory
_ads_platform_tool_proxy: Optional[AdsPlatformApiToolProxy] = None
_crm_tool_proxy: Optional[CrmApiToolProxy] = None
_privacy_controls_tool: Optional[PrivacyControlsTool] = None
_beab_tool: Optional[BrandEquityAwareBiddingTool] = None
_hp_tool: Optional[HyperdimensionalPersonalizationTool] = None
_awe_service: Optional[AdaptiveWorkflowEvolutionService] = None
_rtb_tool: Optional[RealTimeBiddingTool] = None
_ad_optimization_tool: Optional[AdOptimizationTool] = None
async def initialize_business_apps():
global _ads_platform_tool_proxy, _crm_tool_proxy, _privacy_controls_tool, _beab_tool, _hp_tool, _awe_service, _rtb_tool, _ad_optimization_tool
if not _config_obj or not _real_dependencies:
logger.critical("Cannot initialize Business Apps: Config or core dependencies missing.")
return
try:
_ads_platform_tool_proxy = AdsPlatformApiToolProxy(_config_obj, _config_obj.service_endpoints.ads_platform_api_endpoint)
await _ads_platform_tool_proxy.initialize()
_crm_tool_proxy = CrmApiToolProxy(_config_obj, _config_obj.service_endpoints.crm_api_endpoint)
await _crm_tool_proxy.initialize()
_privacy_controls_tool = PrivacyControlsTool(_config_obj) # Sync init ok
_beab_tool = BrandEquityAwareBiddingTool(_config_obj, _kg_tool_proxy, _moe_registry_proxy, _expert_invoker_proxy, _xai_proxy, _ads_platform_tool_proxy)
_hp_tool = HyperdimensionalPersonalizationTool(_config_obj, _kg_tool_proxy, _nn_tool_proxy, _moe_registry_proxy, _expert_invoker_proxy, _fm_client_proxy, _crm_tool_proxy)
_awe_service = AdaptiveWorkflowEvolutionService(_config_obj, _kg_tool_proxy, _fm_client_proxy, _workflows_client_proxy, _workflow_executions_client_proxy) # Inject real clients
_rtb_tool = RealTimeBiddingTool(_beab_tool, _ads_platform_tool_proxy)
_ad_optimization_tool = AdOptimizationTool(_config_obj, _moe_registry_proxy, _expert_invoker_proxy, _fm_client_proxy, _ads_platform_tool_proxy, _kg_tool_proxy)
# Initialize other tools...
logger.info("Business Application Tools/Services initialized.")
except Exception as e:
logger.critical(f"Business Application Tools initialization failed: {e}", exc_info=True)
# Set all to None on failure
_ads_platform_tool_proxy = _crm_tool_proxy = _privacy_controls_tool = _beab_tool = _hp_tool = _awe_service = _rtb_tool = _ad_optimization_tool = None
async def cleanup_business_apps():
if _ads_platform_tool_proxy: await _ads_platform_tool_proxy.cleanup()
if _crm_tool_proxy: await _crm_tool_proxy.cleanup()
# Add cleanup for other tools if needed
print("\n--- MIZ 3.0 Business Applications Layer Logic (Cell 7 - Reworked) ---") print("AWE Service uses real Vertex AI Client proxies (if available). App Tools use real dependencies/proxies via MIZ OKI APIs.") print("Requires implementation of AWE monitoring/evolution logic, App Tool logic & External API Tools.") print("-----------------------------------------------------------------------")



SyntaxError: invalid syntax (<ipython-input-12-64148d231762>, line 2)

In [13]:
**Cell 8: Learning Flows Implementation (Reworked)**

*   **Original Purpose:** Define learning components (KD, CV, DRS, DRL), synchronous logic, placeholder dependencies, inline training.
*   **Key Changes:** Logic refactored into async `Tool`/`Service` classes. KD, CV, DRS tools interact with dependencies (FM, LI, Optimizer, Pub/Sub) via injected *proxies/clients*. `DistributedRLManager` triggers *external* MLOps training via Pub/Sub client proxy. Shows async GCS pattern for buffer saving (requires `aio-gcsfs`). All public methods handle MIZ OKI payloads. State persistence (DRL buffer, CV queue state, etc.) needs explicit implementation using external stores (GCS, BQ, Firestore).
*   **Reworked Code:**

```python
# Cell 8: Learning Flows Implementation (Reworked)
# Status: KD, CV, DRS use real dependencies/proxies via MIZ OKI APIs.
#         DRL Manager uses real PubSub proxy to trigger external MLOps training.
#         Async GCS pattern shown (requires aio-gcsfs).
#         Placeholders remain for drift/bias detection, DRL buffer persistence/loading.

import logging
import time
import random
import numpy as np
import pandas as pd
import asyncio
from typing import Dict, Any, Optional, List, Union, Callable, Tuple
from collections import deque, defaultdict, Counter # Added Counter
import json
import uuid
import os # Added for GCS path joining
import aiofiles # For async file operations if needed locally

# --- Assume Real Tool/Client Dependencies are Injected/Available ---
# These proxies represent API clients for other deployed MIZ OKI services or GCP clients.
try:
    # Use CONFIG_OBJ loaded in Cell 1
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ: raise NameError("CONFIG_OBJ not found or is None")
    if not isinstance(CONFIG_OBJ, EnhancedConfig): raise NameError("CONFIG_OBJ is not an EnhancedConfig instance")

    # Proxies for other MIZ OKI Tool APIs
    if 'foundation_model_client' not in globals(): raise NameError("foundation_model_client proxy not found") # Cell 18 Proxy
    if 'li_tool' not in globals(): raise NameError("li_tool (LearningIntegrationTool instance/proxy) not found") # Cell 5 Proxy
    if 'optimizer_tool' not in globals(): raise NameError("optimizer_tool (HolisticOptimizerTool instance/proxy) not found") # Cell 5 Proxy

    # Real/Mock Client for GCP Pub/Sub
    if '_pubsub_client' not in globals(): raise NameError("_pubsub_client not found") # Cell 8 needs this

    _config_obj = CONFIG_OBJ
    _fm_client_proxy = foundation_model_client
    _li_tool_proxy = li_tool
    _optimizer_tool_proxy = optimizer_tool
    _pubsub_client_proxy = _pubsub_client # Use real/mock client proxy
    _real_dependencies = True
    logger.debug("Using real/conceptual dependencies in Cell 8 (Reworked).")

    # Check for aio-gcsfs for async GCS operations
    try:
        import aio_gcsfs; import gcsfs
        AIO_GCS_AVAILABLE = True
    except ImportError:
        AIO_GCS_AVAILABLE = False
        logging.warning("aio-gcsfs/gcsfs not installed. Async GCS operations will be simulated.")

except NameError as e:
    logger.warning(f"Dependency Error in Cell 8 ({e}). Using Mocks/Placeholders.")
    _real_dependencies = False
    AIO_GCS_AVAILABLE = False
    # --- Mock/Placeholder Setup ---
    class MockFMClientTool: async def generate_text(self, input_data): await asyncio.sleep(0.01); return {"status": "success", "payload": {"generated_text": "Mock"}}; async def analyze(self, input_data): await asyncio.sleep(0.01); return {"status": "success", "payload": {'sentiment': 'positive'}}
    class MockLITool: async def integrate_learning(self, input_data): await asyncio.sleep(0.01); return {"status": "success", "payload": {"integration_id": "mock_li"}}
    class MockOptimizerTool: async def get_current_objective_priorities(self, input_data): await asyncio.sleep(0.01); return {"status": "success", "payload": {"priorities": {'mock': 0.5}}}
    class MockPubSubClient: async def publish(self, topic, data_bytes): await asyncio.sleep(0.01); return f"msg_{uuid.uuid4().hex[:8]}"
    # Define minimal config if needed
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ:
        from dataclasses import dataclass, field
        @dataclass class MockGcpConfig: project_id:Optional[str]="mock-proj"; region:str="mock-region"; gcs_bucket_name:Optional[str]="mock-bucket"
        @dataclass class MockFmDefaults: llama4_maverick: str = "mock-llama"; feedback_analyzer_model: str = "mock-analyzer"
        @dataclass class MockFmConfig: defaults: MockFmDefaults = field(default_factory=MockFmDefaults)
        @dataclass class MockLearningFlowsConfig: kd: Dict = field(default_factory=lambda: {"output_gcs_prefix": "kd/"}); cv: Dict = field(default_factory=lambda: {"feedback_queue_maxsize": 100, "monitoring_interval_seconds": 60, "feedback_analyzer_model_alias": "mock-analyzer"}); drs: Dict = field(default_factory=lambda: {'base_weights': {}, "update_interval_seconds": 60, "objective_influence_factor": 0.3}); drl: Dict = field(default_factory=lambda: {"buffer_size": 100, "buffer_save_interval_sec": 60, "min_buffer_for_train": 10, "buffer_gcs_prefix": "rl/"})
        @dataclass class MockConfig: gcp: MockGcpConfig = field(default_factory=MockGcpConfig); mlops_trigger_topic:str="mock-topic"; mlops_rl_train_topic:str="mock-rl"; foundation_models: MockFmConfig = field(default_factory=MockFmConfig); learning_flows: MockLearningFlowsConfig = field(default_factory=MockLearningFlowsConfig); miz_oki_schema_version: str = "3.0"; def get(self, key, default=None): parts=key.split('.'); val=self; try: [val := getattr(val, p) for p in parts]; return val; except: return default
        _config_obj = MockConfig()

    _fm_client_proxy = MockFMClientTool(); _li_tool_proxy = MockLITool(); _optimizer_tool_proxy = MockOptimizerTool(); _pubsub_client_proxy = MockPubSubClient()
    # --- End Mock/Placeholder Setup ---

logger = logging.getLogger('MIZ-OKI.LearningFlows')

# --- Knowledge Distillation Tool (Reworked Async - Uses Real FM Client Proxy) ---
class KnowledgeDistillationTool:
    """ Handles distilling knowledge async via FM Client API proxy. Deployed as a service. """
    def __init__(self, fm_client_proxy: Any, config: EnhancedConfig):
        if not config or not fm_client_proxy:
            raise InitializationError("KnowledgeDistillationTool requires config and FM Client proxy.")
        self.fm_client = fm_client_proxy
        self.config = config
        self.teacher_model_alias = config.get('learning_flows.kd.teacher_model_alias', config.foundation_models.defaults.llama4_maverick)
        self.output_bucket = config.gcp.gcs_bucket_name
        self.output_prefix = config.get('learning_flows.kd.output_gcs_prefix', 'kd_outputs/')
        self.logger = logging.getLogger('MIZ-OKI.KnowledgeDistillationTool')
        self.logger.info(f"KnowledgeDistillation Tool logic initialized with teacher: {self.teacher_model_alias} (Reworked).")

    def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
        """Helper to construct a standard MIZ OKI response."""
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
            "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "KnowledgeDistillationTool", "target_component": request_data.get("source_component"),
            "status": status, "payload": payload, "error_details": errors, "metadata": {}
        }

    async def _save_teacher_outputs_gcs(self, target_gcs_path: str, outputs: List[Any]) -> bool:
        """ Saves teacher outputs to GCS asynchronously using aio-gcsfs if available. """
        self.logger.info(f"Saving {len(outputs)} teacher outputs to {target_gcs_path}...")
        if not AIO_GCS_AVAILABLE:
            logger.warning(f"Async GCS save to {target_gcs_path} simulated (aio-gcsfs not installed).")
            await asyncio.sleep(0.05) # Simulate I/O
            return True # Simulate success for placeholder

        try:
            # Use sync GCSFS for checking/creating directory (less critical path)
            gcs_dir = os.path.dirname(target_gcs_path)
            fs = gcsfs.GCSFileSystem(project=self.config.gcp.project_id)
            if not fs.exists(gcs_dir):
                fs.makedirs(gcs_dir)
                logger.info(f"Created GCS directory: {gcs_dir}")

            # Use async aio-gcsfs for writing the file content
            afs = aio_gcsfs.GCSFileSystem(project=self.config.gcp.project_id)
            # Serialize each output item as a JSON line
            output_str = "\n".join(json.dumps(item, default=str) for item in outputs)
            async with afs.open(target_gcs_path, 'wb') as f:
                await f.write(output_str.encode('utf-8'))
            self.logger.info(f"Teacher outputs saved successfully to GCS: {target_gcs_path}")
            return True
        except Exception as e:
            logger.error(f"Failed to save teacher outputs to GCS async ({target_gcs_path}): {e}", exc_info=True)
            return False

    async def distill_knowledge(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Performs teacher generation async. Expects/Returns MIZ OKI payload. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); student_model_details = payload.get("student_model_details", {}); dataset_ref = payload.get("dataset_ref", {}); distillation_params = payload.get("distillation_params", {})
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        student_name = student_model_details.get("name", f"kd_student_{uuid.uuid4().hex[:6]}")
        kd_run_id = f"kd_run_{student_name}_{uuid.uuid4().hex[:8]}"
        self.logger.info(f"Starting async KD teacher generation for student: {student_name} (RunID: {kd_run_id})")

        if not self.fm_client: errors.append({"code": "MISSING_DEPENDENCY", "message": "FM Client Tool proxy unavailable."})
        if not self.output_bucket: errors.append({"code": "CONFIG_ERROR", "message": "GCS output bucket not configured."})
        if errors:
            response = self._create_miz_oki_response(input_data, "config_error", errors=errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        teacher_output_path = f"gs://{self.output_bucket}/{self.output_prefix.strip('/')}/{student_name}_{kd_run_id}_teacher.jsonl"
        teacher_outputs = []; status = "pending"; response_payload = None

        try:
            # --- TODO: Implement robust data loading based on dataset_ref ---
            # If dataset_ref['gcs_uri'] exists, load data from GCS asynchronously.
            # For now, using 'inputs_preview' as the source data.
            input_items = dataset_ref.get("inputs_preview", [])
            if not input_items or not isinstance(input_items, list):
                raise ValueError("No valid input data provided in 'dataset_ref.inputs_preview'.")
            # --- End TODO ---

            self.logger.info(f"KD {kd_run_id}: Generating teacher outputs using {self.teacher_model_alias} for {len(input_items)} items...")

            # Generate teacher predictions async via FM Client API proxy
            teacher_tasks = []
            batch_size = 20 # Batch calls to FM API
            for i in range(0, len(input_items), batch_size):
                batch_inputs = input_items[i:i+batch_size]
                # --- TODO: Construct appropriate prompts based on task type ---
                # Example for summarization:
                prompts = [f"Input: {inp}\nSummary:" for inp in batch_inputs]
                # --- End TODO ---
                fm_request = {
                    "payload": {"prompt": prompts, "model_alias": self.teacher_model_alias, "max_tokens": 256}, # Example params
                    "trace_id": trace_id, "request_id": f"kd_fm_batch_{i//batch_size}_{kd_run_id}"
                }
                teacher_tasks.append(self.fm_client.generate_text(input_data=fm_request)) # Call API proxy

            batch_responses = await asyncio.gather(*teacher_tasks, return_exceptions=True)

            # Process results, pairing input with output
            output_pairs = []
            input_idx = 0
            for fm_response in batch_responses:
                if isinstance(fm_response, Exception):
                    errors.append({"code": "FM_API_ERROR", "message": f"FM API call failed during batch: {fm_response}"})
                    # Skip results for this failed batch
                    input_idx += batch_size # Approximate skip
                    continue
                if fm_response.get("status") == "success":
                    generated_texts = fm_response.get("payload", {}).get("generated_text", [])
                    # Ensure generated_texts is a list, even if single prompt was sent
                    if not isinstance(generated_texts, list): generated_texts = [generated_texts]
                    batch_input_slice = input_items[input_idx : input_idx + len(generated_texts)]
                    for inp, output_text in zip(batch_input_slice, generated_texts):
                        output_pairs.append({"input": inp, "teacher_prediction": output_text})
                    input_idx += len(generated_texts)
                else:
                    errors.append({"code": "FM_TOOL_ERROR", "message": f"FM Tool API failed: {fm_response.get('error_details')}"})
                    input_idx += batch_size # Approximate skip

            teacher_outputs = output_pairs
            if not teacher_outputs:
                # If there were input items but no outputs (e.g., all API calls failed), raise error
                if input_items and not errors: errors.append({"code": "KD_NO_OUTPUTS", "message": "Teacher generation yielded no outputs despite valid inputs."})
                if not errors: errors.append({"code": "KD_NO_OUTPUTS", "message": "Teacher generation yielded no outputs."})
                raise RuntimeError("Teacher generation yielded no valid outputs.")


            # Save teacher outputs to GCS (async)
            save_success = await self._save_teacher_outputs_gcs(teacher_output_path, teacher_outputs)
            if not save_success:
                raise RuntimeError(f"Failed to save teacher outputs to {teacher_output_path}")

            status = "success" if not errors else "partial_success"
            response_payload = {"teacher_output_path": teacher_output_path, "outputs_generated": len(teacher_outputs)}
            self.logger.info(f"KD {kd_run_id}: Teacher generation finished. Status: {status}. Outputs at: {teacher_output_path}")

        except Exception as e:
            status = "internal_error"; errors.append({"code": "KD_ERROR", "message": str(e)})
            self.logger.error(f"KD teacher generation FAILED (RunID: {kd_run_id}): {e}", exc_info=True)

        response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

# --- Continuous Validation Service (Reworked Async - Uses Real FM/LI Tool Proxies) ---
class ContinuousValidationService:
    """ Monitors performance, detects issues, processes feedback async via Tool APIs. Deployed as a long-running service or scheduled job. """
    def __init__(self, fm_client_proxy: Any, learning_integrator_tool_proxy: Any, config: EnhancedConfig):
        if not config or not fm_client_proxy or not learning_integrator_tool_proxy:
            raise InitializationError("ContinuousValidationService requires config and proxies for FM Client and LI Tool.")
        self.fm_client = fm_client_proxy
        self.learning_integrator_tool = learning_integrator_tool_proxy
        self.config = config
        # Use asyncio.Queue for in-memory buffer; consider persistent queue (Pub/Sub, Task Queues) for production
        self.feedback_queue = asyncio.Queue(maxsize=config.get('learning_flows.cv.feedback_queue_maxsize', 1000))
        self.monitoring_interval = config.get('learning_flows.cv.monitoring_interval_seconds', 300)
        self.drift_threshold = config.get('learning_flows.cv.drift_detection_threshold', 0.1)
        self.bias_threshold = config.get('learning_flows.cv.bias_detection_threshold', 0.05)
        self.feedback_analyzer_alias = config.get('learning_flows.cv.feedback_analyzer_model_alias', config.foundation_models.defaults.feedback_analyzer_model)
        self._monitor_task: Optional[asyncio.Task] = None
        self.logger = logging.getLogger('MIZ-OKI.ContinuousValidationService')
        self.logger.info("ContinuousValidation Service logic initialized (Reworked).")

    async def start_monitoring(self):
        """Starts the background monitoring loop."""
        if self._monitor_task is None or self._monitor_task.done():
            self.logger.info(f"Starting CV monitor loop (interval: {self.monitoring_interval}s).")
            self._monitor_task = asyncio.create_task(self._monitoring_loop())
        else:
            self.logger.warning("CV Monitoring task already running.")

    async def stop_monitoring(self):
        """Stops the background monitoring loop gracefully."""
        if self._monitor_task and not self._monitor_task.done():
            self.logger.info("Stopping CV monitor loop...")
            self._monitor_task.cancel()
            try:
                await self._monitor_task
            except asyncio.CancelledError:
                self.logger.info("CV monitoring loop stopped.")
            finally:
                self._monitor_task = None
        else:
             self.logger.info("CV monitoring loop not running.")

    async def _monitoring_loop(self):
         """The core background loop for periodic validation checks."""
         while True:
             try:
                 await self.run_validation_cycle()
                 await asyncio.sleep(self.monitoring_interval)
             except asyncio.CancelledError:
                 self.logger.info("CV monitoring loop cancelled.")
                 break
             except Exception as e:
                 # Log error but continue loop
                 self.logger.error(f"Error in CV monitoring loop: {e}", exc_info=True)
                 # Optional: Implement backoff before retrying
                 await asyncio.sleep(self.monitoring_interval * 0.5) # Shorter sleep after error

    async def add_feedback(self, input_data: Dict[str, Any]): # Expects MIZ OKI payload
        """ Adds feedback item (MIZ OKI payload) to the processing queue asynchronously. """
        try:
            # Basic validation of input_data structure
            if not isinstance(input_data, dict) or "payload" not in input_data:
                 logger.warning("CV: Received invalid feedback format (missing payload). Discarding.")
                 return
            await self.feedback_queue.put(input_data) # No need for put_nowait if called from async context
            self.logger.debug(f"Added feedback to CV queue. Size: {self.feedback_queue.qsize()}")
        except asyncio.QueueFull:
            logger.warning(f"CV feedback queue is full (max: {self.feedback_queue.maxsize}). Discarding new feedback.")
        except Exception as e:
             logger.error(f"CV: Error adding feedback to queue: {e}", exc_info=True)

    async def _detect_drift(self, context: Dict) -> bool:
        """ Placeholder for data or model drift detection logic. """
        # --- TODO: Implement Drift Detection ---
        # - Fetch baseline data/predictions (e.g., from GCS, BQ, KG).
        # - Fetch current data/predictions.
        # - Use statistical tests (e.g., KS test, Population Stability Index) or monitoring services.
        # - Compare distributions or model performance metrics.
        # - Return True if drift > self.drift_threshold.
        # --- End TODO ---
        logger.debug("CV: Checking for drift async (Placeholder - Needs implementation).")
        await asyncio.sleep(random.uniform(0.05, 0.15)) # Simulate check time
        return random.random() < 0.05 # Simulate 5% chance of drift

    async def _detect_bias(self, context: Dict) -> bool:
        """ Placeholder for bias detection logic. """
        # --- TODO: Implement Bias Detection ---
        # - Requires labeled data with sensitive attributes (e.g., demographics).
        # - Fetch predictions for different subgroups.
        # - Calculate fairness metrics (e.g., demographic parity, equalized odds).
        # - Compare metrics against self.bias_threshold.
        # - Return True if bias detected.
        # --- End TODO ---
        logger.debug("CV: Checking for bias async (Placeholder - Needs implementation).")
        await asyncio.sleep(random.uniform(0.05, 0.15)) # Simulate check time
        return random.random() < 0.02 # Simulate 2% chance of bias

    async def _process_feedback_queue_items(self, trace_id: Optional[str] = None):
        """ Processes feedback from the queue via FM Client API proxy and triggers LI Tool API proxy. """
        processed_feedback_logs = []; items_processed = 0; max_items_per_cycle = 100 # Limit processing per cycle
        if not self.fm_client or not self.learning_integrator_tool:
            logger.error("CV cannot process feedback: FMClient or LI Tool proxy missing.")
            return # Exit if dependencies missing

        logger.debug(f"CV: Processing feedback queue (Current size: {self.feedback_queue.qsize()})...")
        while not self.feedback_queue.empty() and items_processed < max_items_per_cycle:
            try:
                item_input_data = await self.feedback_queue.get() # Get item from queue
                items_processed += 1
                item_payload = item_input_data.get("payload", {})
                feedback_text = item_payload.get('feedback')
                feedback_type = item_payload.get('type', 'general')
                item_request_id = item_input_data.get('request_id', uuid.uuid4().hex[:6]) # Get ID for logging

                # Analyze unstructured text feedback via FM Client API proxy
                if isinstance(feedback_text, str) and feedback_type == 'unstructured':
                    logger.debug(f"CV: Analyzing feedback via FM Client API proxy: '{feedback_text[:50]}...'")
                    try:
                        # Prepare MIZ OKI request for FM Client Tool
                        fm_request = {
                            "payload": {"text": feedback_text, "model_alias": self.feedback_analyzer_alias},
                            "trace_id": trace_id, "request_id": f"fm_analyze_fb_{item_request_id}"
                        }
                        fm_response = await self.fm_client.analyze(input_data=fm_request) # Call API proxy

                        if fm_response.get("status") == "success":
                            item_payload['analysis'] = fm_response.get("payload", {}) # Add analysis results
                        else:
                            item_payload['analysis'] = {'error': f"FM API proxy failed: {fm_response.get('error_details')}"}
                            logger.warning(f"CV: FM Client API proxy failed to analyze feedback: {fm_response.get('error_details')}")
                    except Exception as fm_e:
                        logger.error(f"CV: Exception calling FM Client API proxy for feedback analysis: {fm_e}", exc_info=True)
                        item_payload['analysis'] = {'error': f"Exception during FM API call: {fm_e}"}

                processed_feedback_logs.append(item_payload) # Add processed payload to batch
                self.feedback_queue.task_done() # Mark item as processed
            except asyncio.QueueEmpty:
                break # Should not happen with await get() unless queue becomes empty concurrently
            except Exception as q_e:
                logger.error(f"CV: Error processing item from feedback queue: {q_e}", exc_info=True)
                # Optionally put item back? Or log and discard? Logging and discarding for now.
                self.feedback_queue.task_done() # Ensure task_done is called even on error

        # Trigger Learning Integrator Tool API Proxy if feedback was processed
        if processed_feedback_logs:
            logger.info(f"CV: Processed {len(processed_feedback_logs)} feedback items. Triggering LI Tool API proxy.")
            try:
                 # Prepare MIZ OKI request for LI Tool
                 li_request = {
                     "payload": {
                         "knowledge_type": 'feedback_batch',
                         "knowledge_data": processed_feedback_logs,
                         "source": 'continuous_validation',
                         "importance": 0.7 # Example importance
                     },
                     "trace_id": trace_id, "request_id": f"li_integrate_fb_{trace_id or uuid.uuid4().hex[:6]}"
                 }
                 li_response = await self.learning_integrator_tool.integrate_learning(input_data=li_request) # Call API proxy

                 if li_response.get("status") == "success":
                     logger.info("CV: Feedback integration triggered successfully via LI Tool API proxy.")
                 else:
                     logger.error(f"CV: LI Tool API proxy call failed: {li_response.get('error_details')}")
            except Exception as li_e:
                logger.error(f"CV: Failed to trigger LI Tool API proxy for feedback batch: {li_e}", exc_info=True)
        else:
            logger.debug("CV: No feedback items processed in this cycle.")

    async def run_validation_cycle(self):
        """ Runs one validation cycle: checks drift/bias, processes feedback queue, triggers LI on issues. """
        cv_id = f"cv_cycle_{uuid.uuid4().hex[:8]}"; start_time = time.monotonic(); trace_id = f"trace_{cv_id}"
        self.logger.info(f"Starting CV cycle async (ID: {cv_id})...")
        issues_found = False; drift_detected = False; bias_detected = False

        try:
            # Run checks concurrently with feedback processing
            drift_task = self._detect_drift({}); bias_task = self._detect_bias({})
            feedback_task = self._process_feedback_queue_items(trace_id=trace_id)

            # Gather detection results, handling potential exceptions
            detection_results = await asyncio.gather(drift_task, bias_task, return_exceptions=True)
            drift_detected = detection_results[0] if not isinstance(detection_results[0], Exception) else False
            bias_detected = detection_results[1] if not isinstance(detection_results[1], Exception) else False
            if isinstance(detection_results[0], Exception): logger.error(f"CV {cv_id}: Drift detection failed: {detection_results[0]}")
            if isinstance(detection_results[1], Exception): logger.error(f"CV {cv_id}: Bias detection failed: {detection_results[1]}")

            # Ensure feedback processing finishes
            await feedback_task

            # Check if issues were detected
            if drift_detected: logger.warning(f"CV {cv_id}: Drift DETECTED (Threshold: {self.drift_threshold})."); issues_found = True
            if bias_detected: logger.warning(f"CV {cv_id}: Bias DETECTED (Threshold: {self.bias_threshold})."); issues_found = True

            # Trigger Learning Integrator Tool API Proxy if issues found
            if issues_found:
                if self.learning_integrator_tool:
                    logger.info(f"CV {cv_id}: Triggering LI Tool API proxy due to detected issues.")
                    try:
                         # Prepare MIZ OKI request for LI Tool
                         li_request = {
                             "payload": {
                                 "knowledge_type": 'validation_alert',
                                 "knowledge_data": {'drift_detected': drift_detected, 'bias_detected': bias_detected, 'cycle_id': cv_id},
                                 "source": 'continuous_validation_alert',
                                 "importance": 0.9 # High importance for validation alerts
                             },
                             "trace_id": trace_id, "request_id": f"li_alert_{cv_id}"
                         }
                         li_response = await self.learning_integrator_tool.integrate_learning(input_data=li_request) # Call API proxy

                         if li_response.get("status") == "success": logger.info(f"CV {cv_id}: LI trigger successful via API proxy.")
                         else: logger.error(f"CV {cv_id}: LI Tool API proxy call failed: {li_response.get('error_details')}")
                    except Exception as li_e:
                        logger.error(f"CV {cv_id}: Failed to trigger LI Tool API proxy for validation alert: {li_e}", exc_info=True)
                else:
                    logger.error(f"CV {cv_id}: Issues detected but LI Tool proxy unavailable!")

        except Exception as cycle_e:
             logger.error(f"CV cycle {cv_id} encountered an error: {cycle_e}", exc_info=True)

        duration = (time.monotonic() - start_time) * 1000
        self.logger.info(f"CV cycle {cv_id} finished in {duration:.2f} ms. Issues found: {issues_found}")

# --- Dynamic Reward System Tool (Reworked Async - Uses Real PO Tool Proxy) ---
class DynamicRewardSystemTool:
    """ Calculates rewards async based on outcomes and current objective priorities fetched via PO Tool API proxy. Deployed as a service. """
    def __init__(self, optimizer_tool_proxy: Any, config: EnhancedConfig):
        if not config or not optimizer_tool_proxy:
            raise InitializationError("DynamicRewardSystemTool requires config and Optimizer Tool proxy.")
        self.optimizer_tool = optimizer_tool_proxy
        self.config = config
        self.base_reward_weights = config.get('learning_flows.drs.base_weights', {'task_completion': 1.0, 'efficiency': 0.5})
        self.objective_influence_factor = config.get('learning_flows.drs.objective_influence_factor', 0.3)
        self.update_interval = config.get('learning_flows.drs.update_interval_seconds', 600)
        self.current_reward_weights = self.base_reward_weights.copy() # Start with base weights
        self._last_objective_update_time = 0
        self._update_lock = asyncio.Lock()
        self.logger = logging.getLogger('MIZ-OKI.DynamicRewardSystemTool')
        self.logger.info(f"DynamicRewardSystem Tool logic initialized (Reworked). Base weights: {self.base_reward_weights}")

    def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
        """Helper to construct a standard MIZ OKI response."""
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
            "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "DynamicRewardSystemTool", "target_component": request_data.get("source_component"),
            "status": status, "payload": payload, "error_details": errors, "metadata": {}
        }

    async def _adjust_weights_for_objectives(self, trace_id: Optional[str] = None):
        """ Adjusts reward weights based on current objective priorities fetched via PO Tool API proxy. """
        now = time.monotonic()
        async with self._update_lock: # Prevent concurrent updates
            if now - self._last_objective_update_time < self.update_interval:
                logger.debug("DRS: Skipping objective weight adjustment, too soon.")
                return # Too soon to update

            if not self.optimizer_tool or not hasattr(self.optimizer_tool, 'get_current_objective_priorities'):
                logger.warning("DRS: Optimizer Tool proxy unavailable or method missing. Cannot adjust weights.")
                self._last_objective_update_time = now # Still update time to prevent rapid retries
                return

            self.logger.info("DRS: Adjusting reward weights based on current objectives...")
            try:
                # Call PO Tool API proxy
                po_request = {"miz_oki_version": "3.0", "trace_id": trace_id, "request_id": f"po_get_prio_{trace_id or uuid.uuid4().hex[:6]}"}
                po_response = await self.optimizer_tool.get_current_objective_priorities(input_data=po_request) # Call proxy method

                if po_response.get("status") == "success":
                    current_objective_priorities = po_response.get("payload", {}).get("priorities", {}) # Expects {'ObjectiveName': priority_score}
                    self.logger.info(f"DRS: Fetched objective priorities async via PO Tool API proxy: {current_objective_priorities}")

                    # --- Recalculate weights (Sync logic ok) ---
                    adjusted_weights = self.base_reward_weights.copy()
                    total_priority_score = sum(p for p in current_objective_priorities.values() if isinstance(p, (int, float)) and p > 0)

                    if total_priority_score > 0:
                        # Normalize priorities
                        normalized_priorities = {obj: score / total_priority_score for obj, score in current_objective_priorities.items() if isinstance(score, (int, float)) and score > 0}

                        # Adjust base weights based on priorities
                        # --- TODO: Implement mapping logic from objectives to reward metrics ---
                        # This requires knowing which metrics contribute to which objectives.
                        # Example: If 'ImproveQuality' objective has high priority, boost 'quality' reward weight.
                        # Example: If 'ReduceCost' objective has high priority, boost 'efficiency' reward weight.
                        for objective, priority in normalized_priorities.items():
                            # Find metrics related to this objective (e.g., from config or KG)
                            related_metrics = self.config.get(f"objective_metric_mapping.{objective}", []) # Example config path
                            for metric in related_metrics:
                                if metric in adjusted_weights:
                                    adjustment = priority * self.objective_influence_factor
                                    adjusted_weights[metric] = self.base_reward_weights[metric] * (1 + adjustment)
                                    logger.debug(f"DRS: Adjusted weight for '{metric}' by factor {1+adjustment} due to objective '{objective}' priority {priority:.2f}")
                        # --- End TODO ---

                    self.current_reward_weights = adjusted_weights
                    self.logger.info(f"DRS: Adjusted reward weights via PO Tool API proxy: {self.current_reward_weights}")
                else:
                    logger.error(f"DRS: Failed to get objective priorities from PO Tool API proxy: {po_response.get('error_details')}")
                    # Keep current weights on failure

                self._last_objective_update_time = now # Update time even on failure to prevent rapid retries

            except Exception as e:
                logger.error(f"DRS: Failed to adjust reward weights via PO Tool API proxy: {e}", exc_info=True)
                # Revert to base weights on unexpected error? Or keep current? Keeping current for now.
                # self.current_reward_weights = self.base_reward_weights.copy()
                self._last_objective_update_time = now # Update time

    async def calculate_reward(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Calculates reward async based on outcome metrics and dynamic weights. Expects/Returns MIZ OKI payload. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); outcome_metrics = payload.get("outcome_metrics", {}) # e.g., {"task_completion": 1.0, "efficiency": 0.8, "quality": 0.9}
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        if not isinstance(outcome_metrics, dict): errors.append({"code": "INVALID_PAYLOAD", "message": "'payload.outcome_metrics' must be a dictionary."})
        if errors:
            response = self._create_miz_oki_response(input_data, "bad_request", errors=errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        status = "pending"; response_payload = None; total_reward = 0.0
        try:
            # Ensure weights are up-to-date based on objectives
            await self._adjust_weights_for_objectives(trace_id=trace_id)

            # Calculate weighted reward
            calculation_details = {}
            for metric, value in outcome_metrics.items():
                weight = self.current_reward_weights.get(metric)
                if weight is not None:
                     try:
                         metric_value = float(value)
                         # Ensure reward component is not NaN or Inf
                         if np.isnan(metric_value) or np.isinf(metric_value): raise ValueError("Metric value is NaN or Inf")
                         if np.isnan(weight) or np.isinf(weight): raise ValueError("Weight is NaN or Inf")

                         reward_component = metric_value * weight
                         if np.isnan(reward_component) or np.isinf(reward_component): raise ValueError("Calculated reward component is NaN or Inf")

                         total_reward += reward_component
                         calculation_details[metric] = {"value": metric_value, "weight": weight, "reward": reward_component}
                     except (ValueError, TypeError) as val_err:
                         logger.warning(f"DRS: Invalid value '{value}' or weight '{weight}' for metric '{metric}'. Skipping. Error: {val_err}")
                         calculation_details[metric] = {"value": value, "weight": weight, "error": f"Invalid value/weight: {val_err}"}
                else:
                     logger.debug(f"DRS: No weight defined for metric '{metric}'. Skipping.")
                     calculation_details[metric] = {"value": value, "weight": None, "reward": 0}

            # Ensure final reward is not NaN or Inf
            if np.isnan(total_reward) or np.isinf(total_reward):
                logger.error(f"DRS: Final calculated reward is NaN or Inf. Resetting to 0. Details: {calculation_details}")
                total_reward = 0.0
                errors.append({"code": "REWARD_NAN_INF", "message": "Calculated reward resulted in NaN or Inf."})
                status = "failed"
            else:
                status = "success"

            response_payload = {"reward": total_reward, "calculation_details": calculation_details, "weights_used": self.current_reward_weights}
            self.logger.debug(f"DRS: Calculated reward via Tool API: {total_reward:.4f}")

        except Exception as e:
             status = "internal_error"; errors.append({"code": "REWARD_CALC_ERROR", "message": str(e)})
             logger.error(f"DRS: Failed to calculate reward: {e}", exc_info=True)

        response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

# --- Distributed Reinforcement Learning Manager (Reworked Async - Real PubSub Proxy, External Training) ---
class DistributedRLManager:
    """ Manages RL agent experiences & triggers external MLOps training via Pub/Sub client proxy. Deployed as a service. """
    def __init__(self, reward_system_tool_proxy: Any, pubsub_client_proxy: Any, config: EnhancedConfig):
        if not config or not reward_system_tool_proxy or not pubsub_client_proxy:
            raise InitializationError("DistributedRLManager requires config and proxies for DRS Tool and PubSub Client.")
        self.reward_system_tool = reward_system_tool_proxy
        self.pubsub_client = pubsub_client_proxy
        self.config = config
        self.agents: Dict[str, Any] = {} # Stores ADK agent references (e.g., service URLs or IDs) - Needs registration mechanism

        # --- TODO: Implement robust buffer persistence ---
        # Option 1: Use BigQuery table (good for querying, potential write limits)
        # Option 2: Partitioned GCS files (e.g., hourly/daily JSONL files) - Shown in placeholder save method
        # Option 3: Dedicated time-series DB or buffer service
        self.experience_buffer = deque(maxlen=config.get('learning_flows.drl.buffer_size', 50000)) # In-memory placeholder ONLY
        self.buffer_save_path_prefix = f"gs://{config.gcp.gcs_bucket_name}/{config.get('learning_flows.drl.buffer_gcs_prefix', 'rl_buffer/')}"
        # --- End TODO ---

        self.save_interval_seconds = config.get('learning_flows.drl.buffer_save_interval_sec', 600)
        self.min_buffer_for_train = config.get('learning_flows.drl.min_buffer_for_train', 1000)
        self.mlops_rl_train_topic_name = config.mlops_rl_train_topic
        self.project = config.gcp.project_id

        self._save_task: Optional[asyncio.Task] = None
        self.logger = logging.getLogger('MIZ-OKI.DistributedRLManager')
        self.logger.info(f"DistributedRLManager initialized (Reworked). Buffer Size: {self.experience_buffer.maxlen} (In-Memory Placeholder)")

    def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
        """Helper to construct a standard MIZ OKI response."""
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
            "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "DistributedRLManager", "target_component": request_data.get("source_component"),
            "status": status, "payload": payload, "error_details": errors, "metadata": {}
        }

    # --- Agent Registration (Needs a proper mechanism in deployment) ---
    def register_agent(self, agent_id: str, agent_ref: Any):
        """Registers an RL agent (e.g., its service endpoint or ID)."""
        # In a real system, this might involve service discovery or a configuration update.
        self.agents[agent_id] = agent_ref
        self.logger.info(f"Registered RL agent: {agent_id}")

    def unregister_agent(self, agent_id: str):
        """Unregisters an RL agent."""
        if agent_id in self.agents:
            del self.agents[agent_id]
            self.logger.info(f"Unregistered RL agent: {agent_id}")
    # --- End Agent Registration ---

    async def add_experience(self, input_data: Dict[str, Any]): # Expects MIZ OKI payload
        """ Adds experience tuple (in payload) to the buffer asynchronously. """
        payload = input_data.get("payload", {})
        exp_tuple = payload.get("experience") # Expects (agent_id, state, action, reward, next_state, done, info)

        # Validate experience tuple structure
        if not isinstance(exp_tuple, (list, tuple)) or len(exp_tuple) < 6:
            logger.warning(f"DRL Mgr: Received invalid experience data format: {type(exp_tuple)}. Discarding.")
            return # Discard invalid data

        agent_id = exp_tuple[0]
        # Optional: Check if agent is registered (might be too slow for high-throughput)
        # if agent_id not in self.agents:
        #     logger.warning(f"DRL Mgr: Received experience from unregistered agent '{agent_id}'. Discarding.")
        #     return # Discard data from unknown agents

        # --- TODO: Add to persistent buffer store ---
        # Example: Append to a BigQuery table or write to a temporary file before batch GCS upload.
        # For now, using the in-memory deque placeholder.
        try:
            # Add timestamp to the experience tuple before storing
            timestamped_exp = (*exp_tuple, datetime.now(datetime.timezone.utc).isoformat())
            self.experience_buffer.append(timestamped_exp)
            # logger.debug(f"DRL Mgr: Collected experience from {agent_id}. Buffer size: {len(self.experience_buffer)}")
        except Exception as buf_e:
             logger.error(f"DRL Mgr: Error adding experience to buffer: {buf_e}", exc_info=True)
        # --- End TODO ---

    async def _save_buffer_to_gcs_async(self) -> Optional[str]:
        """ Saves buffer snapshot to GCS asynchronously. Needs robust implementation. Returns GCS path on success. """
        # --- TODO: Implement robust, incremental saving using async GCS writes (aio-gcsfs) ---
        # - Read from persistent buffer source (e.g., BQ table, recent GCS files).
        # - Aggregate data for the training job.
        # - Write aggregated data to a new GCS file.
        # - Handle serialization, partitioning for large buffers.
        # --- Placeholder Implementation (Saves in-memory deque) ---
        buffer_copy = list(self.experience_buffer) # Get snapshot
        if not buffer_copy:
            logger.info("DRL Mgr: Experience buffer is empty. Nothing to save.")
            return None

        # Create a timestamped filename
        ts = datetime.now(datetime.timezone.utc).strftime('%Y%m%d_%H%M%S_%f')
        filename = f"rl_experience_{ts}.jsonl"
        gcs_path = os.path.join(self.buffer_save_path_prefix.rstrip('/'), filename)
        self.logger.info(f"DRL Mgr: Saving {len(buffer_copy)} experiences to {gcs_path}...")

        if not AIO_GCS_AVAILABLE:
            logger.warning(f"Async GCS save to {gcs_path} simulated (aio-gcsfs not installed).")
            await asyncio.sleep(0.1) # Simulate I/O
            return gcs_path # Simulate success

        try:
            # Use sync GCSFS for checking/creating directory (less critical path)
            gcs_dir = os.path.dirname(gcs_path)
            fs = gcsfs.GCSFileSystem(project=self.config.gcp.project_id)
            if not fs.exists(gcs_dir):
                fs.makedirs(gcs_dir)
                logger.info(f"Created GCS directory: {gcs_dir}")

            # Use async aio-gcsfs for writing the file content
            afs = aio_gcsfs.GCSFileSystem(project=self.config.gcp.project_id)
            # Serialize each experience tuple as a JSON line
            # Ensure all elements are JSON serializable (e.g., numpy arrays converted to lists)
            def serialize_experience(exp):
                serializable_exp = []
                for item in exp:
                    if isinstance(item, np.ndarray): serializable_exp.append(item.tolist())
                    elif isinstance(item, (np.int_, np.intc, np.intp, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64)): serializable_exp.append(int(item))
                    elif isinstance(item, (np.float_, np.float16, np.float32, np.float64)): serializable_exp.append(float(item))
                    elif isinstance(item, (np.bool_)): serializable_exp.append(bool(item))
                    elif isinstance(item, (datetime, datetime.date)): serializable_exp.append(item.isoformat())
                    else: serializable_exp.append(item)
                # Use default=str as a fallback for other non-serializable types
                return json.dumps(serializable_exp, default=str)

            output_str = "\n".join(serialize_experience(item) for item in buffer_copy)
            async with afs.open(gcs_path, 'wb') as f:
                await f.write(output_str.encode('utf-8'))
            self.logger.info(f"DRL Mgr: Experience buffer snapshot saved to {gcs_path}.")
            return gcs_path
        except Exception as e:
            logger.error(f"DRL Mgr: Failed to save experience buffer to {gcs_path}: {e}", exc_info=True)
            return None
        # --- End Placeholder ---

    async def start_buffer_saving(self):
        """Starts the periodic background task for saving the experience buffer."""
        if self._save_task is None or self._save_task.done():
            self.logger.info(f"Starting periodic RL buffer save task (Interval: {self.save_interval_seconds}s)...")
            self._save_task = asyncio.create_task(self._buffer_save_loop())
        else:
             self.logger.warning("DRL buffer save task already running.")

    async def stop_buffer_saving(self):
        """Stops the periodic background buffer saving task."""
         if self._save_task and not self._save_task.done():
             self.logger.info("Stopping DRL buffer save task...")
             self._save_task.cancel()
             try: await self._save_task
             except asyncio.CancelledError: pass # Expected exception on cancellation
             finally: self._save_task = None
             self.logger.info("DRL buffer save task stopped.")
         else:
              self.logger.info("DRL buffer save task not running.")

    async def _buffer_save_loop(self):
         """Background loop for periodically saving the buffer."""
         while True:
             try:
                 await asyncio.sleep(self.save_interval_seconds)
                 await self._save_buffer_to_gcs_async()
             except asyncio.CancelledError:
                 self.logger.info("DRL buffer save loop cancelled.")
                 break
             except Exception as e:
                 logger.error(f"Error in DRL buffer save loop: {e}", exc_info=True)
                 # Optional: Implement backoff before retrying save
                 await asyncio.sleep(self.save_interval_seconds * 0.5) # Shorter sleep after error

    async def trigger_training(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """ Triggers external MLOps RL training pipeline via Pub/Sub client proxy. Expects/Returns MIZ OKI. """
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); force = payload.get("force", False)
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        if not self.pubsub_client: errors.append({"code": "MISSING_DEPENDENCY", "message": "PubSub client proxy unavailable."})
        if not self.project: errors.append({"code": "CONFIG_ERROR", "message": "GCP Project ID not configured."})
        if errors:
            response = self._create_miz_oki_response(input_data, "config_error", errors=errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        status = "pending"; response_payload = None
        try:
            # --- TODO: Get current buffer length from persistent store ---
            buffer_len = len(self.experience_buffer) # Using placeholder length
            # --- End TODO ---

            if not force and buffer_len < self.min_buffer_for_train:
                msg = f"Skipping training trigger. Buffer size {buffer_len} < {self.min_buffer_for_train}."
                logger.info(f"DRL Mgr: {msg}")
                status = "skipped"; response_payload = {"message": msg}
            else:
                self.logger.info(f"DRL Mgr: Triggering MLOps RL training pipeline via Pub/Sub proxy (Buffer size: {buffer_len}, Force: {force}).")
                # Ensure latest data is saved before triggering training
                latest_snapshot_uri = await self._save_buffer_to_gcs_async()
                if not latest_snapshot_uri:
                    raise RuntimeError("Failed to save experience buffer to GCS before triggering training.")

                pipeline_name = "miz3_rl_agent_training_pipeline" # Should be configurable
                # Parameters for the Vertex AI Pipeline job
                pipeline_params = {
                    "project": self.project,
                    "location": self.config.gcp.region,
                    "experience_data_uri": latest_snapshot_uri, # Pass GCS path to training data
                    "agent_ids": list(self.agents.keys()), # Pass IDs of agents to train/update
                    "timestamp_trigger": datetime.now(datetime.timezone.utc).isoformat(),
                    # Add other necessary pipeline parameters (e.g., hyperparameters, model output paths)
                    "output_model_dir": f"gs://{self.config.gcp.gcs_bucket_name}/rl_models/{datetime.now(datetime.timezone.utc).strftime('%Y%m%d%H%M%S')}/"
                }
                # MIZ OKI formatted message for Pub/Sub
                message_data = {
                    "miz_oki_version": self.config.miz_oki_schema_version,
                    "event_type": "trigger_mlops_pipeline",
                    "payload": {"pipeline_name": pipeline_name, "parameters": pipeline_params},
                    "metadata": {"trace_id": trace_id, "source_component": "DistributedRLManager"}
                }
                message_bytes = json.dumps(message_data).encode('utf-8')
                rl_train_topic_full_path = f"projects/{self.project}/topics/{self.mlops_rl_train_topic_name}"

                # Call Pub/Sub Client Proxy method
                message_id = await self.pubsub_client.publish(rl_train_topic_full_path, message_bytes)

                status = "success"
                response_payload = {"message_id": message_id, "pipeline_triggered": pipeline_name, "data_snapshot": latest_snapshot_uri}
                self.logger.info(f"DRL Mgr: Triggered MLOps RL pipeline via Pub/Sub proxy. Topic: {rl_train_topic_full_path}, Message ID: {message_id}.")
                # --- TODO: Optionally clear the buffer after successful trigger/save, depending on persistence strategy ---
                # self.experience_buffer.clear() # If using in-memory only
                # --- End TODO ---

        except Exception as e:
             status = "internal_error"; errors.append({"code": "TRAINING_TRIGGER_ERROR", "message": str(e)})
             logger.error(f"DRL Mgr: Failed to trigger training pipeline: {e}", exc_info=True)

        response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

# --- Initialization (Conceptual - Services instantiated by framework/orchestrator) ---
# kd_tool: Optional[KnowledgeDistillationTool] = None
# cv_service: Optional[ContinuousValidationService] = None
# drs_tool: Optional[DynamicRewardSystemTool] = None
# drl_manager: Optional[DistributedRLManager] = None

# async def initialize_learning_flows():
#      global kd_tool, cv_service, drs_tool, drl_manager
#      if not _config_obj or not _real_dependencies:
#          logger.critical("Cannot initialize Learning Flows: Config or dependencies missing.")
#          return
#      try:
#          kd_tool = KnowledgeDistillationTool(_fm_client_proxy, _config_obj)
#          cv_service = ContinuousValidationService(_fm_client_proxy, _li_tool_proxy, _config_obj)
#          drs_tool = DynamicRewardSystemTool(_optimizer_tool_proxy, _config_obj)
#          drl_manager = DistributedRLManager(drs_tool, _pubsub_client_proxy, _config_obj)

#          # Start background tasks if deployed as services
#          await cv_service.start_monitoring()
#          await drl_manager.start_buffer_saving()

#          logger.info("Learning Flow Tools/Services initialized.")
#      except Exception as e:
#           logger.critical(f"Learning Flow Tools initialization failed: {e}", exc_info=True)
#           kd_tool = cv_service = drs_tool = drl_manager = None # Set all to None on failure

# async def cleanup_learning_flows():
#       if cv_service: await cv_service.stop_monitoring()
#       if drl_manager: await drl_manager.stop_buffer_saving()
#       # Add cleanup for other components if needed

print("\n--- MIZ 3.0 Learning Flows Layer Logic (Cell 8 - Reworked) ---")
print("KD, CV, DRS use real dependencies/proxies via MIZ OKI APIs. DRL Manager uses real PubSub proxy.")
print("Async GCS pattern shown (requires aio-gcsfs). Training logic externalized via MLOps Pub/Sub triggers.")
print("Requires implementation of drift/bias detection, robust DRL buffer persistence.")
print("------------------------------------------------------------------")

SyntaxError: invalid syntax (<ipython-input-13-f54153018474>, line 1)

In [14]:
# Cell 9: System Integration and Testing (Reworked)
# Status: Uses unittest.IsolatedAsyncioTestCase. Mocks refined for Vertex AI Workflow client.
#         Emphasizes need for MIZ OKI payload validation and better mocks/error path tests.

import unittest
import logging
import asyncio
from unittest.mock import patch, MagicMock, AsyncMock # Standard mocking libraries
import uuid
import time
import random
from typing import Dict, Any, Optional, List, Union
import datetime
import json

# --- Pydantic for MIZ OKI Payload Validation (Conceptual) ---
# Import Pydantic if available for schema validation in tests
try:
    from pydantic import BaseModel, Field, ValidationError, validator
    PYDANTIC_AVAILABLE = True

    # Define a basic Pydantic model for MIZ OKI Payloads (adapt as needed)
    class MizOkiPayload(BaseModel):
        miz_oki_version: str = "3.0"
        request_id: str
        trace_id: Optional[str] = None
        workflow_execution_id: Optional[str] = None
        step_id: Optional[str] = None
        timestamp: str # ISO format string
        source_component: Optional[str] = None
        target_component: Optional[str] = None
        status: Optional[str] = None # For responses
        payload: Optional[Dict[str, Any]] = None
        error_details: Optional[List[Dict[str, Any]]] = None
        metadata: Optional[Dict[str, Any]] = None

        @validator('timestamp')
        def validate_timestamp_isoformat(cls, v):
            try:
                datetime.datetime.fromisoformat(v.replace('Z', '+00:00'))
                return v
            except ValueError:
                raise ValueError('Timestamp must be in ISO 8601 format')

except ImportError:
    PYDANTIC_AVAILABLE = False
    class BaseModel: pass; class Field: pass; class ValidationError(Exception): pass; validator = lambda x: x # Dummy decorator
    logging.warning("Pydantic not installed. Cannot perform MIZ OKI payload validation in tests.")

# --- Assume necessary components/mocks are importable or defined ---
# Using refined mocks from previous cells' rework analysis

# Mock Config (Simple dict for testing)
class MockEnhancedConfig(dict):
     def __init__(self, data): super().__init__(data)
     def get(self, key, default=None): # Simplified get for testing
          try: return self[key]
          except KeyError: return default

# Mock KG Tool API Service Proxy
class MockKGTool:
    _decision_logs = {}
    async def execute_query(self, request: Dict): logger.info(f"Mock KG Tool API: execute_query async"); await asyncio.sleep(0.01); return {"status": "success", "payload": {"results": [{'mock_result': 1}]}}
    async def save_decision_record(self, request: Dict): record = request.get("payload",{}); decision_id = record.get('decision_id'); logger.info(f"Mock KG Tool API: save_decision_record async for {decision_id}"); await asyncio.sleep(0.01); self._decision_logs[decision_id] = record; return {"status": "success"}
    async def retrieve_decision_record(self, request: Dict): decision_id = request.get("payload",{}).get("decision_id"); logger.info(f"Mock KG Tool API: retrieve_decision_record async for {decision_id}"); await asyncio.sleep(0.01); record = self._decision_logs.get(decision_id); return {"status": "success" if record else "not_found", "payload": {"decision_record": record}}
    async def get_entity_endpoint(self, request: Dict): logger.info("Mock KG Tool API: get_entity async"); await asyncio.sleep(0.01); return {"status": "success", "payload": {"entity_data": {"mock_prop": "value"}}}
    async def search_vector_endpoint(self, request: Dict): logger.info("Mock KG Tool API: search_vector_index async"); await asyncio.sleep(0.01); return {"status": "success", "payload": {"results": [("mock_id", 0.9, {})]}}
    # Add other methods used by tests if necessary

# Mock FM Client API Service Proxy
class MockFoundationModelClient:
    async def generate_text(self, input_data: Dict): logger.info(f"Mock FMClient API: generate_text async"); await asyncio.sleep(0.05); return {"status": "success", "payload": {"generated_text": "Mock async generation"}, "metadata": {"provider": "mock"}}
    async def generate_embedding(self, input_data: Dict): logger.info(f"Mock FMClient API: generate_embedding async"); await asyncio.sleep(0.02); return {"status": "success", "payload": {"embedding": [random.random()]*10}, "metadata": {"provider": "mock"}}
    async def analyze(self, input_data: Dict): logger.info(f"Mock FMClient API: analyze async"); await asyncio.sleep(0.03); return {"status": "success", "payload": {'sentiment': 'neutral'}, "metadata": {"provider": "mock"}}
    # Add other methods used by tests if necessary

# Mock Vertex Workflow Executions Client (Refined from Cell 16 Rework)
# Define dummy state enum if SDK not available
if not VERTEX_WORKFLOWS_SDK_AVAILABLE:
    class ExecutionState: ACTIVE="ACTIVE"; SUCCEEDED="SUCCEEDED"; FAILED="FAILED"; CANCELLED="CANCELLED"; SUSPENDED="SUSPENDED"
    class DummyProto: pass
    class executions_v1: ExecutionState = ExecutionState; class ExecutionsAsyncClient: pass; class Execution(DummyProto): pass; class CreateExecutionRequest: pass; class GetExecutionRequest: pass; class CancelExecutionRequest: pass; class ListExecutionsRequest: pass; class ListExecutionsResponse: pass; ExecutionView = type('Enum', (), {'BASIC': 1, 'FULL': 2})()
    class google_api_exceptions: class NotFound(Exception): pass; class FailedPrecondition(Exception): pass; class GoogleAPIError(Exception): pass

class MockVertexWorkflowClient:
    _executions = {}
    # Use real State enum if SDK available, else mock
    State = executions_v1.Execution.State if VERTEX_WORKFLOWS_SDK_AVAILABLE else ExecutionState
    ExecutionView = executions_v1.ExecutionView if VERTEX_WORKFLOWS_SDK_AVAILABLE else type('Enum', (), {'BASIC': 1, 'FULL': 2})()
    ExecutionProto = executions_v1.Execution if VERTEX_WORKFLOWS_SDK_AVAILABLE else MagicMock # Use MagicMock as proto fallback
    CreateExecutionRequest = executions_v1.CreateExecutionRequest if VERTEX_WORKFLOWS_SDK_AVAILABLE else MagicMock
    GetExecutionRequest = executions_v1.GetExecutionRequest if VERTEX_WORKFLOWS_SDK_AVAILABLE else MagicMock
    CancelExecutionRequest = executions_v1.CancelExecutionRequest if VERTEX_WORKFLOWS_SDK_AVAILABLE else MagicMock
    ListExecutionsRequest = executions_v1.ListExecutionsRequest if VERTEX_WORKFLOWS_SDK_AVAILABLE else MagicMock
    ListExecutionsResponse = executions_v1.ListExecutionsResponse if VERTEX_WORKFLOWS_SDK_AVAILABLE else MagicMock

    async def create_execution(self, request) -> ExecutionProto:
        parent = request.parent
        # execution_input = request.execution # Accessing proto fields directly might fail with MagicMock
        input_arg = getattr(request.execution, 'argument', '{}') # Safely get argument

        project, location, workflow = parent.split('/')[1], parent.split('/')[3], parent.split('/')[5]
        exec_id_suffix = uuid.uuid4().hex[:12]; exec_name = f"projects/{project}/locations/{location}/workflows/{workflow}/executions/{exec_id_suffix}"
        logger.info(f"MOCK CreateExecution: {exec_name}")
        await asyncio.sleep(0.05)
        status = self.State.ACTIVE
        # Simulate suspension based on input payload for testing
        try:
            miz_oki_input = json.loads(input_arg)
            if miz_oki_input.get("payload", {}).get("force_suspend") or (random.random() < 0.1 and miz_oki_input.get("payload", {}).get("allow_suspend")):
                 status = self.State.SUSPENDED
                 logger.info(f"MOCK Execution {exec_name} starting in SUSPENDED state.")
        except json.JSONDecodeError: pass # Ignore if argument isn't valid JSON

        start_time_dt = datetime.now(datetime.timezone.utc)
        self._executions[exec_name] = {"name": exec_name, "state": status, "argument": input_arg, "start_time": start_time_dt, "result": None, "error": None}
        # Return a mock object mimicking the Execution proto structure
        mock_proto = MagicMock(spec=self.ExecutionProto)
        mock_proto.name = exec_name
        mock_proto.state = status
        mock_proto.argument = input_arg
        mock_proto.start_time = start_time_dt
        mock_proto.result = None
        mock_proto.error = None
        return mock_proto

    async def get_execution(self, request) -> ExecutionProto:
        exec_name = request.name; logger.info(f"MOCK GetExecution: {exec_name}"); await asyncio.sleep(0.02)
        exec_data = self._executions.get(exec_name)
        if not exec_data: raise google_api_exceptions.NotFound(f"Exec {exec_name} not found")

        # Simulate completion/failure progression for ACTIVE state
        if exec_data["state"] == self.State.ACTIVE:
            if random.random() < 0.3: # 30% chance to succeed
                 exec_data["state"] = self.State.SUCCEEDED
                 exec_data["result"] = json.dumps({"output": "mock_success", "final_status": "OK"}) # Simulate MIZ OKI output
                 logger.info(f"MOCK Execution {exec_name} transitioned to SUCCEEDED.")
            elif random.random() < 0.1: # 10% chance to fail (of remaining 70%)
                 exec_data["state"] = self.State.FAILED
                 exec_data["error"] = {"message": "Simulated step failure"} # Mimic error structure
                 logger.info(f"MOCK Execution {exec_name} transitioned to FAILED.")
            # else: remains ACTIVE

        # Return a mock object mimicking the Execution proto structure
        mock_proto = MagicMock(spec=self.ExecutionProto)
        for k, v in exec_data.items(): setattr(mock_proto, k, v)
        # Handle proto-specific attributes if SDK is available
        if VERTEX_WORKFLOWS_SDK_AVAILABLE: mock_proto._pb = exec_data # Allow access to underlying dict if needed
        return mock_proto

    async def cancel_execution(self, request) -> ExecutionProto:
        exec_name = request.name; logger.info(f"MOCK CancelExecution: {exec_name}"); await asyncio.sleep(0.05)
        if exec_name in self._executions and self._executions[exec_name]["state"] in [self.State.ACTIVE, self.State.SUSPENDED]:
            self._executions[exec_name]["state"] = self.State.CANCELLED
            logger.info(f"MOCK Execution {exec_name} transitioned to CANCELLED.")
            mock_proto = MagicMock(spec=self.ExecutionProto)
            for k, v in self._executions[exec_name].items(): setattr(mock_proto, k, v)
            if VERTEX_WORKFLOWS_SDK_AVAILABLE: mock_proto._pb = self._executions[exec_name]
            return mock_proto
        logger.warning(f"MOCK CancelExecution: Exec {exec_name} not found or already finished.")
        raise google_api_exceptions.FailedPrecondition(f"Exec {exec_name} not cancellable")

    async def list_executions(self, request) -> ListExecutionsResponse:
        logger.info(f"MOCK ListExecutions: parent={request.parent}, filter={request.filter}"); await asyncio.sleep(0.05)
        results = []
        parent_prefix = request.parent + "/executions/"
        for name, data in self._executions.items():
            if name.startswith(parent_prefix):
                 matches_filter = True
                 # Simple state filter parsing
                 if request.filter and 'state = "' in request.filter:
                      try:
                          expected_state_name = request.filter.split('"')[1]
                          expected_state = getattr(self.State, expected_state_name)
                          matches_filter = (data.get("state") == expected_state)
                      except (IndexError, KeyError, AttributeError):
                          logger.warning(f"Could not parse state filter: {request.filter}")
                          matches_filter = False
                 if matches_filter:
                      mock_proto = MagicMock(spec=self.ExecutionProto)
                      for k, v in data.items(): setattr(mock_proto, k, v)
                      if VERTEX_WORKFLOWS_SDK_AVAILABLE: mock_proto._pb = data
                      results.append(mock_proto)
        # Return mock response object
        mock_response = MagicMock(spec=self.ListExecutionsResponse)
        mock_response.executions = results
        return mock_response

    # Mock for Pub/Sub signal pattern (used by Human API test)
    async def publish_approval(self, topic, data): # Keep simple mock for testing API logic
        logger.info(f"MOCK Publishing approval signal to {topic}: {data}")
        await asyncio.sleep(0.02); exec_id = data.get("execution_id")
        if exec_id in self._executions and self._executions[exec_id]["state"] == self.State.SUSPENDED:
             new_state = self.State.ACTIVE if data.get("approved") else self.State.FAILED
             self._executions[exec_id]["state"] = new_state
             logger.info(f"MOCK Execution {exec_id} transitioned to {new_state} due to signal.")
             return True
        logger.warning(f"MOCK Approval signal ignored: Exec {exec_id} not found or not SUSPENDED.")
        return False

# Mock ADK Tool API Service Proxy
class MockAdkTool:
    def __init__(self, tool_name="mock_tool"):
        self.tool_name = tool_name

    async def __call__(self, request: Dict): # Expects MIZ OKI request dict
        logger.info(f"Mock ADK Tool API '{self.tool_name}': Called async with request: {request.get('request_id')}")
        await asyncio.sleep(random.uniform(0.05, 0.15))
        # Simulate potential failure
        if request.get("payload",{}).get("force_fail") or (request.get("payload",{}).get("fail_sometimes") and random.random() < 0.3):
            logger.warning(f"Mock ADK Tool '{self.tool_name}': Simulating failure.")
            return {
                "miz_oki_version": request.get("miz_oki_version", "3.0"),
                "request_id": request.get("request_id"), "trace_id": request.get("trace_id"),
                "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
                "source_component": self.tool_name, "target_component": request.get("source_component"),
                "status": "failed",
                "error_details": [{"code": "SIMULATED_ERROR", "message": f"Simulated failure in {self.tool_name}"}]
            }
        # Simulate success
        return {
            "miz_oki_version": request.get("miz_oki_version", "3.0"),
            "request_id": request.get("request_id"), "trace_id": request.get("trace_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": self.tool_name, "target_component": request.get("source_component"),
            "status": "success",
            "payload": {"tool_result": f"Processed by {self.tool_name}", "input_payload_preview": str(request.get("payload"))[:100]}
        }

# --- Setup Logging for Tests ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('MIZ-OKI.IntegrationTest')

# --- Test Suite (Reworked) ---
class TestMIZOKIIntegrationVertexAsync(unittest.IsolatedAsyncioTestCase):
    """Integration tests using Vertex AI/ADK architecture (with mocks)."""

    async def asyncSetUp(self):
        logger.info("Setting up MIZ OKI Vertex/ADK Async Integration Test Suite...")
        self.maxDiff = None # Show full diff on assertion failure
        self.config_data = { # Simple dict for test config
            'gcp': {'project_id': 'test-proj-vertex', 'region': 'test-region-vertex', 'gcs_bucket_name': 'test-bucket'},
            'kg': {'storage_type': 'neo4j', 'vector_index_name': 'test_index'},
            'foundation_models': {'defaults': {'llama4_scout': 'mock-llama3-8b'}, 'keys': {'vertex': 'mock_auth'}},
            'xai': {'storage_type': 'kg'},
            'system_thresholds': {'human_review_confidence_threshold': 0.75},
            'vertex_ai': {'default_workflow_id': 'test-workflow-v1'},
            'miz_oki_schema_version': '3.0' # Added
        }
        # Inject mock instances (representing deployed API services/clients)
        self.kg_tool = MockKGTool()
        self.fm_client = MockFoundationModelClient()
        self.workflow_client = MockVertexWorkflowClient() # Use refined mock
        self.data_processing_tool = MockAdkTool("data_processing_tool")
        self.analysis_tool = MockAdkTool("analysis_tool")
        # Clear mock state before each test
        self.workflow_client._executions.clear()
        self.kg_tool._decision_logs.clear()

    async def asyncTearDown(self):
        logger.info("Tearing down MIZ OKI Vertex/ADK Async Integration Test Suite.")

    def _validate_miz_oki(self, data: Dict, is_request: bool = False):
        """Uses Pydantic to validate MIZ OKI structure if available."""
        if not PYDANTIC_AVAILABLE: return # Skip if Pydantic not installed
        try:
            MizOkiPayload(**data) # Validate against the Pydantic model
            # Add more specific checks if needed (e.g., payload content based on status)
        except ValidationError as e:
            self.fail(f"MIZ OKI Payload Validation Failed:\n{e}\nPayload: {json.dumps(data, indent=2)}")

    # --- Test Cases ---
    def test_01_config_loading_access(self):
        logger.info("Running test: test_01_config_loading_access")
        # Test accessing config data used in setup
        self.assertEqual(self.config_data['gcp']['project_id'], 'test-proj-vertex')
        logger.info("Config access test passed.")

    async def test_02_infrastructure_kg_tool_connectivity(self):
        logger.info("Running test: test_02_infrastructure_kg_tool_connectivity")
        # Simulate a MIZ OKI request to the KG Tool proxy
        request_data = {"payload": {"query": "MATCH (n) RETURN count(n)"}}
        response = await self.kg_tool.execute_query(request=request_data)
        self._validate_miz_oki(response) # Validate response structure
        self.assertEqual(response.get("status"), "success")
        self.assertIsInstance(response.get("payload", {}).get("results"), list)
        logger.info("KG Tool API connectivity test passed.")

    async def test_03_fmclient_invocation_async(self):
        logger.info("Running test: test_03_fmclient_invocation_async")
        # Test text generation
        gen_request = {"payload": {"prompt": "Test prompt", "model_alias": "mock-model"}}
        response_gen = await self.fm_client.generate_text(input_data=gen_request)
        self._validate_miz_oki(response_gen)
        self.assertEqual(response_gen.get("status"), "success")
        self.assertIn("Mock async generation", response_gen.get("payload", {}).get("generated_text", ""))
        # Test embedding
        emb_request = {"payload": {"data": "Test embed", "model_alias": "mock-emb"}}
        response_emb = await self.fm_client.generate_embedding(input_data=emb_request)
        self._validate_miz_oki(response_emb)
        self.assertEqual(response_emb.get("status"), "success")
        self.assertIsInstance(response_emb.get("payload", {}).get("embedding"), list)
        logger.info("FM Client API async invocation test passed.")

    async def test_04_vertex_workflow_start_and_status(self):
        logger.info("Running test: test_04_vertex_workflow_start_and_status")
        workflow_id = self.config_data['vertex_ai']['default_workflow_id']
        input_payload = {"input_param": "value1", "allow_suspend": False} # Prevent suspension for this test
        # Construct the MIZ OKI payload that the AWE service (or similar) would pass
        miz_oki_input_for_workflow = {
            "miz_oki_version": "3.0", "request_id": "req-04", "trace_id": "trace-04",
            "source_component": "TestRunner", "target_component": f"Workflow:{workflow_id}:Step1",
            "payload": input_payload
        }
        # Prepare the request for the Vertex AI client proxy
        request = self.workflow_client.CreateExecutionRequest(
            parent=f"projects/{self.config_data['gcp']['project_id']}/locations/{self.config_data['gcp']['region']}/workflows/{workflow_id}",
            execution=self.workflow_client.ExecutionProto(argument=json.dumps(miz_oki_input_for_workflow)) # Pass MIZ OKI as argument string
        )
        execution = await self.workflow_client.create_execution(request=request)
        execution_name = execution.name
        self.assertTrue(execution_name.startswith(f"projects/{self.config_data['gcp']['project_id']}/locations/"))
        self.assertTrue(execution_name.endswith(execution.name.split('/')[-1])) # Check format
        logger.info(f"Workflow started with execution Name: {execution_name}")

        # Wait briefly and check status
        await asyncio.sleep(0.1)
        status_request = self.workflow_client.GetExecutionRequest(name=execution_name)
        status_execution = await self.workflow_client.get_execution(request=status_request)
        self.assertEqual(status_execution.name, execution_name)
        # Check against possible states (ACTIVE, SUCCEEDED, FAILED - SUSPENDED prevented by input)
        self.assertIn(status_execution.state, [self.workflow_client.State.ACTIVE, self.workflow_client.State.SUCCEEDED, self.workflow_client.State.FAILED])
        logger.info(f"Retrieved workflow status: {status_execution.state.name if hasattr(status_execution.state, 'name') else status_execution.state}")
        logger.info("Vertex Workflow start and status test passed.")

    async def test_05_vertex_workflow_human_approval_signal(self):
        logger.info("Running test: test_05_vertex_workflow_human_approval_signal")
        workflow_id = "approval-workflow-v1"
        input_payload = {"requires_approval": True, "force_suspend": True} # Force suspension
        miz_oki_input_for_workflow = {"payload": input_payload, "trace_id": "trace-05"}
        # Start workflow, forcing suspension via mock logic
        request = self.workflow_client.CreateExecutionRequest(
            parent=f"projects/{self.config_data['gcp']['project_id']}/locations/{self.config_data['gcp']['region']}/workflows/{workflow_id}",
            execution=self.workflow_client.ExecutionProto(argument=json.dumps(miz_oki_input_for_workflow))
        )
        execution = await self.workflow_client.create_execution(request=request)
        execution_id = execution.name

        # Verify it's suspended
        status_request = self.workflow_client.GetExecutionRequest(name=execution_id)
        status_execution = await self.workflow_client.get_execution(request=status_request)
        self.assertEqual(status_execution.state, self.workflow_client.State.SUSPENDED, "Workflow did not suspend as expected.")
        logger.info(f"Workflow {execution_id} correctly suspended.")

        # Simulate approval signal (as if sent via Pub/Sub and handled by Human API backend calling the mock)
        approval_signal_topic = "mock-approval-topic"
        signal_payload = {"execution_id": execution_id, "approved": True, "comments": "Approved by test", "approver": "test_user"}
        resumed = await self.workflow_client.publish_approval(approval_signal_topic, signal_payload) # Call mock method directly
        self.assertTrue(resumed, "Mock approval signal should indicate resumption.")

        # Verify it resumed (became ACTIVE)
        status_after = await self.workflow_client.get_execution(request=status_request)
        self.assertEqual(status_after.state, self.workflow_client.State.ACTIVE, "Workflow should be ACTIVE after approval.")
        logger.info("Vertex Workflow human approval signal test passed.")

    async def test_06_conceptual_workflow_step_tool_call(self):
        logger.info("Running test: test_06_conceptual_workflow_step_tool_call")
        # Simulate input coming from previous workflow step (MIZ OKI format)
        miz_oki_input = {
            "miz_oki_version": "3.0", "request_id": "req-step-06", "trace_id": "trace-06",
            "workflow_execution_id": "exec-123", "step_id": "data_processing_step",
            "source_component": "WorkflowOrchestrator", "target_component": "data_processing_tool",
            "payload": {"data_uri": "gs://bucket/data.csv", "param": 123}
        }
        self._validate_miz_oki(miz_oki_input, is_request=True) # Validate input

        # Call the mock tool API proxy
        response = await self.data_processing_tool(request=miz_oki_input)
        self._validate_miz_oki(response) # Validate output
        self.assertEqual(response.get("status"), "success")
        # Check if the tool correctly identified the caller
        self.assertEqual(response.get("target_component"), miz_oki_input["source_component"])
        self.assertEqual(response.get("payload", {}).get("input_payload_preview"), str(miz_oki_input["payload"])[:100])
        logger.info("Conceptual workflow step tool API call test passed.")

    async def test_07_workflow_failure_status(self):
        logger.info("Running test: test_07_workflow_failure_status")
        workflow_id = "failing-workflow-v1"
        miz_oki_input_for_workflow = {"payload": {}, "trace_id": "trace-07"}
        request = self.workflow_client.CreateExecutionRequest(
            parent=f"projects/{self.config_data['gcp']['project_id']}/locations/{self.config_data['gcp']['region']}/workflows/{workflow_id}",
            execution=self.workflow_client.ExecutionProto(argument=json.dumps(miz_oki_input_for_workflow))
        )
        execution = await self.workflow_client.create_execution(request=request); exec_id = execution.name

        # Force mock to fail state
        if exec_id in self.workflow_client._executions:
            self.workflow_client._executions[exec_id]['state'] = self.workflow_client.State.FAILED
            self.workflow_client._executions[exec_id]['error'] = {'message': 'Simulated step failure'}
        else:
            self.fail(f"Mock execution {exec_id} not found for forcing failure.")

        status_request = self.workflow_client.GetExecutionRequest(name=exec_id)
        status = await self.workflow_client.get_execution(request=status_request)
        self.assertEqual(status.state, self.workflow_client.State.FAILED)
        # Check error details if SDK provides them (mock stores it)
        self.assertIsNotNone(status.error)
        self.assertEqual(status.error.get('message'), 'Simulated step failure')
        logger.info("Workflow failure status test passed.")

    async def test_08_tool_api_error_handling(self):
        logger.info("Running test: test_08_tool_api_error_handling")
        # Simulate input for a tool call
        miz_oki_input = {
            "miz_oki_version": "3.0", "request_id": "req-step-08", "trace_id": "trace-08",
            "payload": {"data_uri": "gs://bucket/data.csv", "force_fail": True} # Use flag for mock tool
        }
        # Call the mock tool API proxy, expecting failure
        response = await self.data_processing_tool(request=miz_oki_input)
        self._validate_miz_oki(response) # Validate error response structure
        self.assertEqual(response.get("status"), "failed") # Changed from 'error' to 'failed' based on mock
        self.assertIsNotNone(response.get("error_details"))
        self.assertEqual(response["error_details"][0]["code"], "SIMULATED_ERROR")
        logger.info("Tool API error handling test passed.")

    async def test_09_complex_workflow_simulation(self):
        logger.info("Running test: test_09_complex_workflow_simulation")
        # Simulate sequence of tool calls as if orchestrated by a workflow
        miz_oki_context = {"miz_oki_version": "3.0", "request_id": "req-complex-09", "trace_id": "trace-09", "workflow_execution_id": "exec-complex"}

        # Step 1: Data Processing Tool
        step1_input = {**miz_oki_context, "step_id": "step1_data_proc", "payload": {"input_data": "start"}}
        self._validate_miz_oki(step1_input, is_request=True)
        step1_response = await self.data_processing_tool(request=step1_input)
        self._validate_miz_oki(step1_response)
        self.assertEqual(step1_response.get("status"), "success")
        processed_data = step1_response.get("payload", {}).get("tool_result")

        # Step 2: Analysis Tool (using output of Step 1)
        step2_input = {**miz_oki_context, "step_id": "step2_analysis", "payload": {"processed_data": processed_data}}
        self._validate_miz_oki(step2_input, is_request=True)
        step2_response = await self.analysis_tool(request=step2_input)
        self._validate_miz_oki(step2_response)
        self.assertEqual(step2_response.get("status"), "success")
        analysis_result = step2_response.get("payload", {}).get("tool_result")

        # Step 3: Generation Tool (using output of Step 2)
        step3_input = {**miz_oki_context, "step_id": "step3_generate", "payload": {"prompt": f"Generate report based on: {analysis_result}", "model_alias": "mock-model"}}
        self._validate_miz_oki(step3_input, is_request=True)
        step3_response = await self.fm_client.generate_text(input_data=step3_input)
        self._validate_miz_oki(step3_response)
        self.assertEqual(step3_response.get("status"), "success")
        self.assertIsNotNone(step3_response.get("payload", {}).get("generated_text"))

        logger.info("Complex workflow simulation API call sequence test passed.")

    async def test_10_xai_logging_retrieval(self):
        logger.info("Running test: test_10_xai_logging_retrieval")
        decision_id = f"xai_test_{uuid.uuid4().hex[:6]}"
        record_to_save = {"decision_id": decision_id, "component": "TestComponent", "decision": {"action": "approve"}, "chain_of_thought": ["Step A", "Step B"], "timestamp": datetime.now(datetime.timezone.utc).isoformat()}
        trace_id = "trace-xai-10"

        # Simulate XAI service calling KG Tool API proxy to save
        save_request = {"payload": {"record": record_to_save}, "trace_id": trace_id}
        save_response = await self.kg_tool.save_decision_record(request=save_request) # Call mock proxy method
        self.assertEqual(save_response.get("status"), "success")

        # Simulate XAI service calling KG Tool API proxy to retrieve
        retrieve_request = {"payload": {"decision_id": decision_id}, "trace_id": trace_id}
        retrieve_response = await self.kg_tool.retrieve_decision_record(request=retrieve_request) # Call mock proxy method
        self._validate_miz_oki(retrieve_response) # Check response structure
        self.assertEqual(retrieve_response.get("status"), "success")
        retrieved_record = retrieve_response.get("payload", {}).get("decision_record")
        self.assertIsNotNone(retrieved_record)
        self.assertEqual(retrieved_record.get("decision_id"), decision_id)
        # Compare relevant fields (ignoring potential timestamp differences if not mocked precisely)
        self.assertEqual(retrieved_record.get("component"), record_to_save["component"])
        self.assertEqual(retrieved_record.get("decision"), record_to_save["decision"])
        self.assertEqual(retrieved_record.get("chain_of_thought"), record_to_save["chain_of_thought"])

        logger.info("XAI logging and retrieval via KG Tool API test passed.")

    # --- TODO: Add More Tests ---
    # - Test MIZ OKI payload validation failures (if Pydantic available)
    # - Test specific error conditions from Vertex AI API (NotFound, PermissionDenied) using mock exceptions
    # - Test interactions between more components (e.g., PO -> HDE -> KG) via mocked APIs
    # - Test edge cases (empty inputs, large payloads if mocks support it)
    # - Test workflow cancellation

# --- Main execution block ---
if __name__ == '__main__':
    # Ensure asyncio event loop is managed correctly for testing
    # unittest.main() handles this when run directly
    unittest.main()

SyntaxError: invalid syntax (<ipython-input-14-0228133b0c76>, line 47)

In [15]:
# Cell 10: Business Impact Monitoring (Reworked)
# Status: Uses real KG Tool API proxy and real Vertex Workflow Executions client proxy.
#         Includes robust workflow polling and improved error handling.

import logging
import pandas as pd
import asyncio
import random
import datetime
import uuid
import json # Added for workflow interaction
import time # Added for polling
from typing import Dict, Any, Optional, List, Union
import numpy as np # Added for nan handling

# --- Assume necessary components are injected or globally available ---
try:
    # Use CONFIG_OBJ loaded in Cell 1
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ: raise NameError("CONFIG_OBJ not found or is None")
    if not isinstance(CONFIG_OBJ, EnhancedConfig): raise NameError("CONFIG_OBJ is not an EnhancedConfig instance")

    # Use KG Tool API Proxy (representing deployed Cell 3 service)
    if 'kg_tool_service_instance' not in globals(): raise NameError("kg_tool_service_instance proxy not found")

    # Use REAL Vertex Workflow Executions Client Proxy (representing client from Cell 16)
    if '_workflow_executions_client' not in globals(): raise NameError("_workflow_executions_client not found")

    _config_obj = CONFIG_OBJ
    _kg_tool_proxy = kg_tool_service_instance
    _vertex_workflow_client_proxy = _workflow_executions_client

    # Import real exceptions/types if SDK available (check from Cell 16/Cell 7)
    if 'VERTEX_WORKFLOWS_SDK_AVAILABLE' not in globals(): VERTEX_WORKFLOWS_SDK_AVAILABLE = False # Assume false if not defined
    if VERTEX_WORKFLOWS_SDK_AVAILABLE:
         from google.api_core import exceptions as google_api_exceptions
         from google.cloud.workflows import executions_v1
         Execution = executions_v1.Execution
         ExecutionState = executions_v1.Execution.State
         CreateExecutionRequest = executions_v1.CreateExecutionRequest
         GetExecutionRequest = executions_v1.GetExecutionRequest
         logger.debug("Using real Vertex AI SDK types/exceptions for Cell 10.")
    else: # Use mock exceptions/types if SDK was unavailable
         from unittest.mock import MagicMock
         google_api_exceptions = MagicMock(); google_api_exceptions.NotFound = type('NotFound', (Exception,), {}); google_api_exceptions.GoogleAPIError = type('GoogleAPIError', (Exception,), {})
         executions_v1 = MagicMock(); Execution = MagicMock(); ExecutionState = MagicMock(); ExecutionState.ACTIVE="ACTIVE"; ExecutionState.SUCCEEDED = "SUCCEEDED"; ExecutionState.FAILED = "FAILED"; ExecutionState.CANCELLED = "CANCELLED"; ExecutionState.SUSPENDED="SUSPENDED"; CreateExecutionRequest = MagicMock(); GetExecutionRequest = MagicMock()
         logger.warning("Using mock Vertex AI SDK types/exceptions for Cell 10.")

    _real_dependencies = True
    logger.debug("Using real dependencies in Cell 10 (Reworked).")

except NameError as e:
    logger.warning(f"Dependency Error in Cell 10 ({e}). Using Mocks/Placeholders.")
    _real_dependencies = False
    # --- Mock/Placeholder Setup ---
    class MockKGTool: async def execute_query(self, request): await asyncio.sleep(0.02); return {"status": "success", "payload": {"results": [{'value': random.uniform(5, 10)}]}}
    class MockVertexWorkflowClient:
        _mock_result = {}; State = ExecutionState
        async def create_execution(self, request): await asyncio.sleep(0.05); self._mock_result = {"status": "success", "payload": {"metric_value": random.uniform(0.04, 0.07)}}; return MagicMock(name=f"metric_exec_{uuid.uuid4().hex[:8]}")
        async def get_execution(self, request): await asyncio.sleep(0.01); return MagicMock(state=self.State.SUCCEEDED, result=json.dumps(self._mock_result)) # Simulate immediate success for mock
    from dataclasses import dataclass, field
    @dataclass class MockGcpConfig: project_id: str = "mock-project"; region: str = "mock-region"
    @dataclass class MockBusinessImpact: kpis: Dict = field(default_factory=lambda: {'ROAS': {'data_source': 'kg', 'query': 'ROAS_QUERY', 'target': 8.0}, 'ConversionRate': {'data_source': 'workflow', 'workflow_id': 'conv_rate_wf', 'target': 0.05}})
    @dataclass class MockEnhancedConfig: gcp: MockGcpConfig = field(default_factory=MockGcpConfig); business_impact: MockBusinessImpact = field(default_factory=MockBusinessImpact); miz_oki_schema_version: str = "3.0"; def get(self, key, default=None): return getattr(self, key, default)
    _config_obj = MockEnhancedConfig(); _kg_tool_proxy = MockKGTool(); _vertex_workflow_client_proxy = MockVertexWorkflowClient()
    google_api_exceptions = MagicMock(); google_api_exceptions.NotFound = type('NotFound', (Exception,), {}); google_api_exceptions.GoogleAPIError = type('GoogleAPIError', (Exception,), {})
    executions_v1 = MagicMock(); Execution = MagicMock(); ExecutionState = MagicMock(); ExecutionState.ACTIVE="ACTIVE"; ExecutionState.SUCCEEDED = "SUCCEEDED"; ExecutionState.FAILED = "FAILED"; ExecutionState.CANCELLED = "CANCELLED"; ExecutionState.SUSPENDED="SUSPENDED"; CreateExecutionRequest = MagicMock(); GetExecutionRequest = MagicMock()
    # --- End Mock Setup ---

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('MIZ-OKI.BusinessImpact')

class BusinessImpactDashboard:
    """Monitors KPIs asynchronously using real KG Tool API proxy and Vertex Workflow Client proxy."""

    def __init__(self, config: EnhancedConfig, kg_tool_proxy: Any, workflow_client_proxy: Optional[Any] = None):
        if not config or not kg_tool_proxy:
             raise InitializationError("BusinessImpactDashboard requires config and KG Tool proxy.")
        self.config = config
        self.kg_tool = kg_tool_proxy
        self.workflow_client = workflow_client_proxy # Use the injected client proxy
        self.project = config.gcp.project_id
        self.location = config.gcp.region
        self.target_metrics: Dict[str, Dict] = {}
        # Use object dtype initially to handle mixed types and NAs gracefully
        self.actual_metrics_history = pd.DataFrame(columns=['time_period']).astype({'time_period': 'object'})
        if not all([self.workflow_client, self.project, self.location]):
             logger.warning("BusinessImpactDashboard initialized without a valid Vertex Workflow client or GCP project/location. Workflow-based KPIs will fail.")
        self._load_config_metrics()
        logger.info(f"BusinessImpactDashboard initialized (Reworked). Tracking: {list(self.target_metrics.keys())}")

    def _load_config_metrics(self):
        """Loads KPI definitions from the configuration."""
        kpis_section = self.config.business_impact.kpis if hasattr(self.config, 'business_impact') else {}
        if not kpis_section:
            logger.warning("No KPIs found in config 'business_impact.kpis'. Dashboard will be empty.")
            return

        self.target_metrics = kpis_section
        metric_names = list(self.target_metrics.keys())
        # Add columns to DataFrame if they don't exist, preserving existing data
        for metric in metric_names:
            if metric not in self.actual_metrics_history.columns:
                self.actual_metrics_history[metric] = pd.Series(dtype='float64') # Use float for metrics
        logger.info(f"Loaded {len(self.target_metrics)} target KPI configurations.")

    async def _fetch_metric_value(self, metric_name: str, metric_config: Dict, time_period: str, trace_id: Optional[str] = None) -> Optional[float]:
        """Fetches a single metric value via KG Tool API proxy or Vertex Workflow Client proxy."""
        data_source = metric_config.get('data_source')
        metric_value = None
        source_identifier = "N/A"
        request_id = f"metric_fetch_{metric_name}_{uuid.uuid4().hex[:6]}"
        logger.debug(f"Fetching metric '{metric_name}' for period '{time_period}' using source '{data_source}'...")

        try:
            if data_source == 'kg':
                query = metric_config.get('query')
                source_identifier = f"KG Tool Query: {query[:50]}..."
                if query and self.kg_tool:
                    # Call KG Tool API proxy
                    kg_request = {"payload": {"query": query, "parameters": {"time_period": time_period}}, "trace_id": trace_id, "request_id": request_id} # Pass time_period if query needs it
                    kg_response = await self.kg_tool.execute_query(request=kg_request) # Call proxy method
                    if kg_response.get("status") == "success" and (results := kg_response.get("payload", {}).get("results")):
                        # Expecting query to return a single row with a 'value' column
                        if results[0] is not None and 'value' in results[0] and results[0]['value'] is not None:
                            metric_value = float(results[0]['value'])
                            logger.debug(f"Metric '{metric_name}' fetched from KG: {metric_value}")
                        else: logger.warning(f"KG query for '{metric_name}' returned invalid result structure or null value: {results[0]}")
                    else: logger.warning(f"KG Tool API query for '{metric_name}' failed or returned no results: {kg_response.get('error_details')}")
                else: logger.warning(f"KG Tool proxy or query missing for metric: {metric_name}")

            elif data_source == 'workflow' or data_source == 'agent': # Treat 'agent' as a workflow call
                workflow_id = metric_config.get('workflow_id')
                source_identifier = f"Vertex Workflow: {workflow_id}"
                if workflow_id and self.workflow_client and VERTEX_WORKFLOWS_SDK_AVAILABLE:
                    parent = f"projects/{self.project}/locations/{self.location}/workflows/{workflow_id}"
                    # Prepare MIZ OKI input for the metric calculation workflow
                    wf_input_payload = {'time_period': time_period, 'metric_name': metric_name}
                    miz_oki_input = {
                        "miz_oki_version": self.config.miz_oki_schema_version,
                        "request_id": request_id, "trace_id": trace_id,
                        "source_component": "BusinessImpactDashboard", "target_component": workflow_id,
                        "payload": wf_input_payload
                    }
                    execution_args = json.dumps(miz_oki_input)
                    execution_proto = Execution(argument=execution_args) # Use real proto type
                    request = CreateExecutionRequest(parent=parent, execution=execution_proto) # Use real request type

                    try:
                        # Start execution via REAL client proxy
                        exec_response = await self.workflow_client.create_execution(request=request)
                        execution_name = exec_response.name
                        logger.info(f"Started metric workflow {execution_name} for '{metric_name}'. Polling for result...")

                        # --- Robust Polling ---
                        max_wait_seconds = 300; poll_interval = 10; start_poll = time.monotonic(); final_status = None
                        while time.monotonic() - start_poll < max_wait_seconds:
                             await asyncio.sleep(poll_interval)
                             try:
                                 status_request = GetExecutionRequest(name=execution_name) # Use real request type
                                 current_exec = await self.workflow_client.get_execution(request=status_request) # Call proxy
                                 final_status = current_exec.state
                             except google_api_exceptions.NotFound:
                                 logger.error(f"Workflow execution {execution_name} disappeared during polling.")
                                 final_status = ExecutionState.FAILED # Treat as failure
                                 break
                             except Exception as poll_err:
                                 logger.warning(f"Error polling workflow {execution_name}: {poll_err}. Retrying...")
                                 continue # Continue polling

                             if final_status in [ExecutionState.SUCCEEDED, ExecutionState.FAILED, ExecutionState.CANCELLED]:
                                  logger.info(f"Workflow {execution_name} finished polling with status: {final_status.name}")
                                  if final_status == ExecutionState.SUCCEEDED:
                                       wf_output_str = current_exec.result
                                       if wf_output_str:
                                            try:
                                                # Expecting workflow to return MIZ OKI response JSON string
                                                wf_output = json.loads(wf_output_str)
                                                if wf_output.get("status") == "success" and "metric_value" in wf_output.get("payload", {}):
                                                     metric_value = float(wf_output["payload"]["metric_value"])
                                                     logger.debug(f"Metric '{metric_name}' fetched from workflow: {metric_value}")
                                                else: logger.warning(f"Workflow '{workflow_id}' succeeded but returned invalid MIZ OKI payload or status: {wf_output_str[:200]}...")
                                            except json.JSONDecodeError as json_e: logger.warning(f"Workflow '{workflow_id}' succeeded but result was not valid JSON: {json_e}")
                                            except (ValueError, TypeError) as val_e: logger.warning(f"Workflow '{workflow_id}' succeeded but metric_value was invalid: {val_e}")
                                       else: logger.warning(f"Workflow '{workflow_id}' succeeded but returned no result string.")
                                  else: logger.warning(f"Workflow '{workflow_id}' for metric '{metric_name}' did not succeed. Status: {final_status.name}")
                                  break # Exit polling loop
                             else: logger.debug(f"Workflow {execution_name} running ({final_status.name})... polling again in {poll_interval}s")
                        else: # Loop finished without break (timeout)
                            logger.error(f"Timeout ({max_wait_seconds}s) waiting for workflow {execution_name} for metric '{metric_name}'.")
                        # --- End Polling ---
                    except google_api_exceptions.NotFound: logger.error(f"Workflow '{workflow_id}' not found.")
                    except google_api_exceptions.GoogleAPIError as api_e: logger.error(f"API Error interacting with workflow '{workflow_id}': {api_e}")
                    except Exception as wf_e: logger.error(f"Error during workflow execution/polling for '{metric_name}': {wf_e}", exc_info=True)
                elif not self.workflow_client or not VERTEX_WORKFLOWS_SDK_AVAILABLE: logger.warning(f"Vertex Workflow Client proxy/SDK unavailable for metric: {metric_name}")
                else: logger.warning(f"Workflow ID missing for metric: {metric_name}")

            elif data_source == 'manual' or data_source is None:
                source_identifier = "Manual/None"; logger.info(f"Metric '{metric_name}' requires manual input or has no source defined."); metric_value = None # Explicitly None
            else:
                source_identifier = f"Unsupported: {data_source}"; logger.error(f"Unsupported data source '{data_source}' for metric: {metric_name}")

        except asyncio.TimeoutError: logger.error(f"Timeout fetching metric '{metric_name}' from {source_identifier}.")
        except ConnectionError as ce: logger.error(f"Connection error fetching metric '{metric_name}': {ce}")
        except ValueError as ve: logger.error(f"Value error processing metric '{metric_name}': {ve}")
        except Exception as e: logger.error(f"Failed to fetch metric '{metric_name}' async from {source_identifier}: {e}", exc_info=True); metric_value = None

        # Final validation and conversion to float or None
        try:
            if metric_value is None or np.isnan(metric_value) or np.isinf(metric_value):
                return None
            return float(metric_value)
        except (ValueError, TypeError):
            logger.error(f"Final value for metric '{metric_name}' is not a valid number: {metric_value}")
            return None

    async def import_actual_metrics(self, time_period: str) -> bool:
        """Imports actual metric values asynchronously for a given time period."""
        logger.info(f"Importing actual metrics async for time period: {time_period}")
        new_metrics = {'time_period': time_period}
        any_success = False
        all_success = True
        fetch_tasks = []
        metric_names_ordered = list(self.target_metrics.keys())

        # Create tasks for fetching each metric value
        for metric_name in metric_names_ordered:
            if metric_name in self.target_metrics:
                fetch_tasks.append(
                    self._fetch_metric_value(metric_name, self.target_metrics[metric_name], time_period)
                )
            else:
                logger.warning(f"Metric '{metric_name}' defined in history columns but not in target_metrics config. Skipping fetch.")
                fetch_tasks.append(asyncio.sleep(0, result=None)) # Add placeholder task

        # Execute tasks concurrently
        metric_values_or_exceptions = await asyncio.gather(*fetch_tasks, return_exceptions=True)

        # Process results
        for i, value_or_exception in enumerate(metric_values_or_exceptions):
            metric_name = metric_names_ordered[i]
            if isinstance(value_or_exception, Exception):
                logger.error(f"Exception fetching metric '{metric_name}': {value_or_exception}", exc_info=False) # Avoid overly verbose logs for expected failures
                new_metrics[metric_name] = pd.NA # Use pandas NA for missing values
                all_success = False
            elif value_or_exception is None:
                logger.warning(f"Could not retrieve value for metric: {metric_name}")
                new_metrics[metric_name] = pd.NA
                all_success = False
            else:
                new_metrics[metric_name] = value_or_exception
                any_success = True

        # Update DataFrame safely
        try:
            # Ensure all expected columns exist before creating the new row DataFrame
            new_row_data = {col: new_metrics.get(col, pd.NA) for col in self.actual_metrics_history.columns}
            new_row = pd.DataFrame([new_row_data], columns=self.actual_metrics_history.columns)

            # Ensure correct dtypes before concat (especially for metrics which should be float)
            for metric_name in self.target_metrics.keys():
                 if metric_name in new_row.columns:
                     new_row[metric_name] = pd.to_numeric(new_row[metric_name], errors='coerce')

            # Append the new row
            self.actual_metrics_history = pd.concat([self.actual_metrics_history, new_row], ignore_index=True)

            # Optional: Persist DataFrame to GCS/BQ here
            # self.persist_history()

            logger.info(f"Finished importing metrics async for {time_period}. History size: {len(self.actual_metrics_history)}. AnySuccess: {any_success}, AllSuccess: {all_success}")
        except Exception as df_e:
            logger.error(f"Error updating metrics history DataFrame: {df_e}", exc_info=True)
            return False # Indicate failure to update history

        return any_success

    def calculate_kpi_trends(self, window: int = 5) -> Dict[str, Optional[float]]:
        """Calculates simple trends (slope) for KPIs over a rolling window (sync)."""
        trends = {}
        if len(self.actual_metrics_history) < 2:
            return {metric: None for metric in self.target_metrics} # Not enough data for trend

        # Ensure metrics columns are numeric, coercing errors
        numeric_df = self.actual_metrics_history.copy()
        for metric in self.target_metrics:
            if metric in numeric_df.columns:
                numeric_df[metric] = pd.to_numeric(numeric_df[metric], errors='coerce')

        relevant_history = numeric_df.tail(window)

        for metric in self.target_metrics:
            if metric in relevant_history.columns:
                values = relevant_history[metric].dropna()
                if len(values) >= 2:
                    # Simple linear regression slope calculation
                    x = np.arange(len(values))
                    try:
                        # Use numpy polyfit for slope
                        slope, _ = np.polyfit(x, values, 1)
                        trends[metric] = float(slope) if not np.isnan(slope) else None
                    except (np.linalg.LinAlgError, ValueError, TypeError):
                         trends[metric] = None # Handle cases where polyfit fails
                else:
                    trends[metric] = None # Not enough data points in window
            else:
                trends[metric] = None # Metric column doesn't exist
        return trends

    def generate_dashboard_data(self, time_period: Optional[str] = None) -> Dict[str, Any]:
        """Generates data structure for dashboard display (sync). Handles NA/NaN for JSON."""
        if self.actual_metrics_history.empty:
            logger.warning("Metrics history is empty. Cannot generate dashboard data.")
            return {"summary": {}, "trends": {}, "history": []}

        # Select the latest row or a specific time period
        if time_period:
            latest_data = self.actual_metrics_history[self.actual_metrics_history['time_period'] == time_period].tail(1)
            if latest_data.empty:
                 logger.warning(f"No data found for time period '{time_period}'. Using latest overall.")
                 latest_data = self.actual_metrics_history.tail(1)
        else:
            latest_data = self.actual_metrics_history.tail(1)

        if latest_data.empty: # Should not happen if history is not empty, but check anyway
             logger.error("Cannot select latest data row for dashboard.")
             return {"summary": {}, "trends": {}, "history": []}

        latest_row = latest_data.iloc[0]
        dashboard_data = {"summary": {}, "trends": {}, "history": []}

        # Generate Summary
        for metric_name, config in self.target_metrics.items():
            actual_raw = latest_row.get(metric_name)
            target = config.get('target') # Target can be None
            status = "Data Missing"; actual_display = "N/A"

            if pd.notna(actual_raw):
                try:
                    actual_float = float(actual_raw)
                    actual_display = f"{actual_float:.3f}" # Format for display
                    if target is not None:
                        try:
                            target_float = float(target)
                            lower_is_better = config.get('lower_is_better', False)
                            # Add tolerance for meeting target?
                            tolerance = config.get('target_tolerance', 0.01) # e.g., 1% tolerance
                            if lower_is_better:
                                is_meeting = actual_float <= target_float * (1 + tolerance)
                            else:
                                is_meeting = actual_float >= target_float * (1 - tolerance)
                            status = "Meeting Target" if is_meeting else ("Above Target" if not lower_is_better else "Below Target")
                        except (ValueError, TypeError):
                            status = "Invalid Target"; target = str(target) # Display target as string if invalid
                    else:
                        status = "No Target Set"
                except (ValueError, TypeError):
                     status = "Invalid Data"; actual_display = str(actual_raw) # Display raw value if invalid
            else:
                 status = "Data Missing" # Keep status if actual is NA

            dashboard_data["summary"][metric_name] = {
                "actual": actual_display,
                "target": target, # Keep target as is (could be None)
                "unit": config.get('unit', ''),
                "status": status
            }

        # Generate Trends
        dashboard_data["trends"] = self.calculate_kpi_trends()

        # Generate History (convert NA/NaN to None for JSON compatibility)
        history_df_cleaned = self.actual_metrics_history.replace({pd.NA: None, np.nan: None})
        dashboard_data["history"] = history_df_cleaned.to_dict('records')

        logger.info(f"Generated dashboard data structure for period: {latest_row.get('time_period')}.")
        return dashboard_data

    def persist_history(self, gcs_path: Optional[str] = None):
        """Placeholder: Persists the metrics history DataFrame."""
        # --- TODO: Implement persistence ---
        # Option 1: Save to GCS as CSV/Parquet
        # Option 2: Append/Upsert to a BigQuery table
        if gcs_path:
             try:
                 # Ensure directory exists (if using local path for testing)
                 # os.makedirs(os.path.dirname(gcs_path), exist_ok=True)
                 # self.actual_metrics_history.to_csv(gcs_path, index=False)
                 logger.info(f"Placeholder: Persisted metrics history to {gcs_path}")
             except Exception as e:
                  logger.error(f"Failed to persist history to {gcs_path}: {e}")
        else:
             logger.debug("No persistence path provided for metrics history.")
        # --- End TODO ---

# --- Initialization (Conceptual - within API service or main script) ---
# dashboard: Optional[BusinessImpactDashboard] = None
# if _config_obj and _kg_tool_proxy and _vertex_workflow_client_proxy:
#     try:
#         dashboard = BusinessImpactDashboard(_config_obj, _kg_tool_proxy, _vertex_workflow_client_proxy)
#         # Optional: Load previous history if persisted
#         # dashboard.load_history(...)
#     except Exception as e:
#         logger.critical(f"Dashboard initialization failed: {e}", exc_info=True)
# else:
#     logger.critical("Dashboard initialization failed: Missing critical dependencies (Config, KG Tool, Workflow Client).")

# --- Example Usage (Conceptual - e.g., called by a scheduled job or API endpoint) ---
# async def run_metric_import_and_get_dashboard():
#     if dashboard:
#         current_period = datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d") # Example: daily period
#         success = await dashboard.import_actual_metrics(current_period)
#         if success:
#             dashboard_data = dashboard.generate_dashboard_data()
#             # dashboard.persist_history() # Persist after successful import
#             return dashboard_data
#         else:
#             logger.error(f"Metric import failed for period {current_period}.")
#             return {"error": "Metric import failed"}
#     else:
#         return {"error": "Dashboard not initialized"}

print("\n--- MIZ 3.0 Business Impact Dashboard Logic (Cell 10 - Reworked) ---")
print("Uses real KG Tool API proxy and real Vertex Workflow Executions client proxy.")
print("Includes robust workflow polling and improved error handling.")
print("Handles NA/NaN values for DataFrame operations and JSON output.")
print("--------------------------------------------------------------------")

SyntaxError: invalid syntax (<ipython-input-15-a5b4309c8de0>, line 55)

In [16]:
# Cell 11: Explainable AI (XAI) (Reworked)
# Status: Uses real KG Tool API proxy & FM Client API proxy.
#         Calls KG Tool proxy for structured storage ('kg' mode).
#         Replaced file logging with Cloud Logging recommendation ('log_file' mode).

import logging
import json
import datetime
import asyncio
from typing import Dict, Any, Optional, List, Union, Callable
import os
import re # For potential parsing if needed

# --- Assume necessary components are injected or globally available ---
try:
    # Use CONFIG_OBJ loaded in Cell 1
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ: raise NameError("CONFIG_OBJ not found or is None")
    if not isinstance(CONFIG_OBJ, EnhancedConfig): raise NameError("CONFIG_OBJ is not an EnhancedConfig instance")

    # Use FM Client API Proxy (representing deployed Cell 18 service)
    if 'foundation_model_client' not in globals(): raise NameError("foundation_model_client proxy not found")

    # Use KG Tool API Proxy (representing deployed Cell 3 service)
    if 'kg_tool_service_instance' not in globals(): raise NameError("kg_tool_service_instance proxy not found")

    _config_obj = CONFIG_OBJ
    _fm_client_proxy = foundation_model_client
    _kg_tool_proxy = kg_tool_service_instance
    _moe_manager_proxy = None # Not directly needed for CoT/Counterfactual generation here

    # Import Cloud Logging client library (optional, based on availability)
    try:
        from google.cloud import logging as cloud_logging
        CLOUD_LOGGING_AVAILABLE = True
    except ImportError:
        CLOUD_LOGGING_AVAILABLE = False
        logging.warning("google-cloud-logging library not found. Cloud Logging for XAI unavailable.")

    _real_dependencies = True
    logger.debug("Using real dependencies in Cell 11 (Reworked).")

except NameError as e:
    logger.warning(f"Dependency Error in Cell 11 ({e}). Using Mocks/Placeholders.")
    _real_dependencies = False
    CLOUD_LOGGING_AVAILABLE = False
    # --- Mock/Placeholder Setup ---
    class MockFoundationModelClient: async def generate_text(self, input_data): await asyncio.sleep(0.05); return {"status": "success", "payload": {"generated_text": "Simulated Counterfactual..."}}
    class MockKGTool:
        _decision_logs = {}
        async def save_decision_record(self, request: Dict): record = request.get("payload",{}).get("record",{}); decision_id = record.get('decision_id'); self._decision_logs[decision_id] = record; return {"status": "success"}
        async def retrieve_decision_record(self, request: Dict): decision_id = request.get("payload",{}).get("decision_id"); record = self._decision_logs.get(decision_id); return {"status": "success" if record else "not_found", "payload": {"decision_record": record}}
    # Define minimal config if needed
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ:
        from dataclasses import dataclass, field
        @dataclass class MockFmDefaults: llama4_maverick: str = "mock-mav"; llama4_scout: str = "mock-scout"; feedback_analyzer_model: str = "mock-analyzer"
        @dataclass class MockFmConfig: defaults: MockFmDefaults = field(default_factory=MockFmDefaults)
        @dataclass class MockXaiConfig: storage_type: str = "kg"; log_name: str = "mock_xai_log"; counterfactual_model_alias: str = "llama4_maverick"
        @dataclass class MockEnhancedConfig: foundation_models: MockFmConfig = field(default_factory=MockFmConfig); xai: MockXaiConfig = field(default_factory=MockXaiConfig); miz_oki_schema_version: str = "3.0"; def get_model_info(self, alias): return {"provider": "mock", "model_id": alias, "pricing": {"prompt": 0.1, "completion": 0.2}}; def get(self, key, default=None): parts=key.split('.'); val=self; try: [val := getattr(val, p) for p in parts]; return val; except: return default
        _config_obj = MockEnhancedConfig()
    _fm_client_proxy = MockFoundationModelClient(); _kg_tool_proxy = MockKGTool(); _moe_manager_proxy = None
    # --- End Mock Setup ---

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('MIZ-OKI.ExplainableAI')

class ExplainableAI:
    """Provides XAI capabilities, storing logs via KG Tool API or Cloud Logging. Deployed as a service."""

    def __init__(self, config: EnhancedConfig, fm_client_proxy: Optional[Any] = None, kg_tool_proxy: Optional[Any] = None):
        if not config: raise InitializationError("Config required for ExplainableAI.")
        self.config = config
        self.fm_client = fm_client_proxy
        self.kg_tool = kg_tool_proxy
        self.xai_storage_type = config.xai.storage_type.lower() # 'kg' or 'log_file'
        self.log_name = config.xai.log_name # Used if storage_type is 'log_file'
        self.counterfactual_model_info = config.get_model_info(config.xai.counterfactual_model_alias)
        self.counterfactual_model_alias = self.counterfactual_model_info.get("model_id") if self.counterfactual_model_info else config.foundation_models.defaults.llama4_maverick # Fallback

        self._cloud_logging_client = None
        self.cloud_logger = None
        if self.xai_storage_type == 'log_file':
            if CLOUD_LOGGING_AVAILABLE:
                try:
                    self._cloud_logging_client = cloud_logging.Client(project=config.gcp.project_id)
                    self.cloud_logger = self._cloud_logging_client.logger(self.log_name)
                    logger.info(f"Using Cloud Logging for XAI storage (Log Name: {self.log_name}).")
                except Exception as log_init_e:
                    logger.error(f"Failed to initialize Cloud Logging client: {log_init_e}. XAI storage via Cloud Logging might fail.", exc_info=True)
                    self._cloud_logging_client = None; self.cloud_logger = None
            else:
                logger.warning("XAI storage set to 'log_file' but Cloud Logging SDK unavailable. XAI storage will be disabled.")
                self.xai_storage_type = 'none' # Disable storage if SDK missing

        # Dependency checks
        if not self.fm_client: logger.warning("XAI initialized without FoundationModelClient proxy. Counterfactual explanations unavailable.")
        if self.xai_storage_type == 'kg' and not self.kg_tool:
            logger.error("XAI storage set to 'kg' but KG Tool proxy is missing. Decision logging/retrieval will fail.")
            self.xai_storage_type = 'none' # Disable storage if dependency missing

        logger.info(f"ExplainableAI initialized (Reworked). Storage: {self.xai_storage_type}, Counterfactual Model: {self.counterfactual_model_alias}")

    def _create_miz_oki_response(self, request_data: Dict, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None) -> Dict:
        """Helper to construct a standard MIZ OKI response."""
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_data.get("request_id"), "trace_id": request_data.get("trace_id"),
            "workflow_execution_id": request_data.get("workflow_execution_id"), "step_id": request_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "ExplainableAI", "target_component": request_data.get("source_component"),
            "status": status, "payload": payload, "error_details": errors, "metadata": {}
        }

    def record_decision(self, input_data: Dict[str, Any]): # Expects MIZ OKI payload
        """
        Records decision details asynchronously based on configured storage type.
        Expects MIZ OKI payload containing the record in `payload.record`.
        This method itself is sync but schedules the async storage task.
        """
        if self.xai_storage_type == 'none':
            logger.debug("XAI storage is disabled. Skipping decision recording.")
            return

        record = input_data.get("payload", {}).get("record", {})
        decision_id = record.get('decision_id')
        if not decision_id:
            logger.error("Cannot record decision: missing 'decision_id' in record payload.")
            return # Don't schedule if ID is missing

        # Add timestamp if missing
        if 'timestamp' not in record:
            record['timestamp'] = datetime.now(datetime.timezone.utc).isoformat()
        # Add trace info from MIZ OKI header
        record['trace_id'] = input_data.get('trace_id')
        record['request_id'] = input_data.get('request_id')

        logger.info(f"Scheduling async recording for decision: {decision_id} from {record.get('component')} (Trace: {record['trace_id']})")
        # Schedule the async storage task to run in the background
        asyncio.create_task(self._store_record_async(record), name=f"store_xai_{decision_id}")

    async def _store_record_async(self, record: Dict):
        """Stores the record asynchronously using KG Tool API proxy or Cloud Logging."""
        decision_id = record['decision_id']
        logger.debug(f"Attempting async storage for decision {decision_id} using {self.xai_storage_type}.")
        try:
            if self.xai_storage_type == 'kg':
                if self.kg_tool and hasattr(self.kg_tool, 'save_decision_record'):
                    # Call KG Tool API proxy with MIZ OKI payload
                    kg_request = {
                        "payload": {"record": record},
                        "trace_id": record.get("trace_id"), # Pass trace ID
                        "request_id": f"kg_save_xai_{decision_id}"
                    }
                    kg_response = await self.kg_tool.save_decision_record(request=kg_request) # Call proxy method
                    if kg_response.get("status") == "success":
                        logger.debug(f"Stored decision {decision_id} via KG Tool API async.")
                    else:
                        logger.error(f"KG Tool API proxy failed to save decision record {decision_id}: {kg_response.get('error_details')}")
                else:
                    logger.error(f"Cannot store decision {decision_id} in KG: KG Tool proxy or method unavailable.")

            elif self.xai_storage_type == 'log_file':
                if self.cloud_logger: # Use Cloud Logging if available
                    try:
                        # log_struct is synchronous, run in thread
                        await asyncio.to_thread(self.cloud_logger.log_struct, record, severity='INFO')
                        logger.debug(f"Stored decision {decision_id} to Cloud Logging.")
                    except Exception as cl_e:
                        logger.error(f"Failed to write XAI log to Cloud Logging: {cl_e}", exc_info=True)
                else:
                    # Fallback to local file (NOT RECOMMENDED for production)
                    # This path should ideally not be hit if Cloud Logging SDK check fails in init
                    logger.critical(f"Cloud Logging unavailable. Cannot store XAI log {decision_id}.")
                    # await asyncio.to_thread(self._append_to_local_log_file, record) # Avoid file I/O in prod

            # else: storage type is 'none' or unsupported, do nothing

        except Exception as e:
            logger.error(f"Failed to store decision record {decision_id} async: {e}", exc_info=True)

    # --- Local file methods removed as Cloud Logging is the preferred non-KG option ---
    # def _append_to_local_log_file(self, record: Dict): ...
    # def _find_in_local_log_file(self, decision_id: str) -> Optional[Dict]: ...

    async def _retrieve_decision_log_async(self, decision_id: str, trace_id: Optional[str] = None) -> Optional[Dict]:
        """Retrieves a decision record async via KG Tool API proxy or Cloud Logging."""
        logger.info(f"Retrieving decision log async for ID: {decision_id}")
        start_time = time.monotonic(); record = None
        try:
            if self.xai_storage_type == 'kg':
                if self.kg_tool and hasattr(self.kg_tool, 'retrieve_decision_record'):
                    # Call KG Tool API proxy
                    kg_request = {
                        "payload": {"decision_id": decision_id},
                        "trace_id": trace_id, "request_id": f"kg_get_xai_{decision_id}"
                    }
                    kg_response = await self.kg_tool.retrieve_decision_record(request=kg_request) # Call proxy method
                    if kg_response.get("status") == "success":
                        record = kg_response.get("payload", {}).get("decision_record")
                    elif kg_response.get("status") != "not_found":
                        logger.error(f"KG Tool API proxy failed to retrieve log {decision_id}: {kg_response.get('error_details')}")
                else:
                    logger.error("Cannot retrieve log from KG: KG Tool proxy or method unavailable.")

            elif self.xai_storage_type == 'log_file':
                if self._cloud_logging_client and CLOUD_LOGGING_AVAILABLE:
                    # --- TODO: Implement Cloud Logging query ---
                    # Requires google-cloud-logging library
                    # This is synchronous and needs careful handling in async context
                    logger.warning("Cloud Logging retrieval for XAI not implemented. Requires sync-to-async handling.")
                    # Example sync logic (needs to be run in thread):
                    # filter_str = f'jsonPayload.decision_id="{decision_id}"'
                    # try:
                    #     iterator = self._cloud_logging_client.list_entries(filter_=filter_str, order_by=cloud_logging.DESCENDING, max_results=1)
                    #     entries = list(iterator) # Blocking call
                    #     if entries: record = entries[0].payload
                    # except Exception as cl_e: logger.error(f"Error querying Cloud Logging for {decision_id}: {cl_e}")
                    # --- End TODO ---
                else:
                     logger.error("Cannot retrieve log: Cloud Logging client unavailable.")

            # else: storage type is 'none' or unsupported

        except Exception as e:
            logger.error(f"Error retrieving decision log {decision_id} async: {e}", exc_info=True)
            return None

        duration = (time.monotonic() - start_time) * 1000
        if record: logger.info(f"Retrieved decision log {decision_id} async in {duration:.2f} ms.")
        else: logger.warning(f"Decision log {decision_id} not found (storage: {self.xai_storage_type}). Retrieval took {duration:.2f} ms.")
        return record

    async def explain_decision(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Generates explanation async. Expects/Returns MIZ OKI payload."""
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); decision_id = payload.get("decision_id"); method = payload.get("method", 'chain_of_thought')
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        if not decision_id: errors.append({"code": "MISSING_ID", "message": "'payload.decision_id' is required."})
        if errors:
            response = self._create_miz_oki_response(input_data, "bad_request", errors=errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        logger.info(f"Generating explanation async for decision {decision_id} using method: {method}")
        status = "pending"; response_payload = None
        explanation = f"Explanation failed: Method '{method}' processing error." # Default error

        decision_log = await self._retrieve_decision_log_async(decision_id, trace_id=trace_id)
        if not decision_log:
            errors.append({"code": "LOG_NOT_FOUND", "message": f"Decision log not found for ID: {decision_id}."})
            status = "not_found"
            response = self._create_miz_oki_response(input_data, status, response_payload, errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        try:
            if method == 'chain_of_thought':
                cot = decision_log.get('chain_of_thought')
                if cot and isinstance(cot, list):
                    # Format CoT nicely
                    explanation_parts = [f"Explanation ({method}) for Decision ID: {decision_id}"]
                    explanation_parts.extend([f"- {step}" for step in cot])
                    explanation = "\n".join(explanation_parts)
                    status = "success"
                else:
                    errors.append({"code": "NO_COT", "message": "No valid Chain of Thought recorded for this decision."}); status = "failed"

            elif method == 'counterfactual':
                 if not self.fm_client: errors.append({"code": "MISSING_DEPENDENCY", "message": "FoundationModelClient proxy unavailable for counterfactuals."}); status = "config_error"
                 elif not self.counterfactual_model_alias: errors.append({"code": "CONFIG_ERROR", "message": "Counterfactual model alias not configured."}); status = "config_error"
                 else:
                     logger.info(f"Generating counterfactual explanation async via FM Client API proxy ({self.counterfactual_model_alias})...")
                     # --- Prepare Prompt ---
                     # Include key inputs, the decision made, and potentially the outcome if available
                     prompt_context = f"Decision Context:\n{json.dumps(decision_log.get('context',{}), indent=2, default=str)}\n"
                     prompt_inputs = f"Decision Inputs:\n{json.dumps(decision_log.get('inputs',{}), indent=2, default=str)}\n"
                     prompt_decision = f"Decision Made:\n{json.dumps(decision_log.get('decision',{}), indent=2, default=str)}\n"
                     prompt_task = "Task: Generate a concise counterfactual explanation. What minimal changes to the inputs or context would have led to a different decision? Explain why."
                     prompt = f"{prompt_context}\n{prompt_inputs}\n{prompt_decision}\n{prompt_task}"
                     # --- End Prepare Prompt ---
                     try:
                         # Call FM Client API proxy
                         fm_request = {
                             "payload": {"prompt": prompt, "model_alias": self.counterfactual_model_alias, "max_tokens": 300, "temperature": 0.4},
                             "trace_id": trace_id, "request_id": f"fm_cf_{request_id}"
                         }
                         fm_response = await self.fm_client.generate_text(input_data=fm_request) # Call API proxy

                         if fm_response.get("status") == "success":
                             generated_text = fm_response.get("payload",{}).get("generated_text")
                             explanation = f"Counterfactual Explanation for {decision_id}:\n{generated_text.strip()}"
                             status = "success"
                         else:
                             raise RuntimeError(f"FM Client API proxy failed: {fm_response.get('error_details')}")
                     except Exception as fm_e:
                         errors.append({"code": "COUNTERFACTUAL_ERROR", "message": str(fm_e)}); logger.error(f"Async counterfactual generation via FM Client API proxy failed: {fm_e}", exc_info=True); status = "failed"

            elif method in ['shap', 'lime']:
                 # These typically require model access and data, hard to do post-hoc from logs alone
                 warning_msg = "SHAP/LIME methods require access to model state and data at decision time, often impractical post-hoc in distributed systems. Consider pre-computed feature importances stored in the log."; errors.append({"code": "UNSUPPORTED_METHOD", "message": warning_msg}); logger.warning(warning_msg); status = "failed"
            else:
                 errors.append({"code": "UNSUPPORTED_METHOD", "message": f"Unsupported explanation method '{method}'."}); status = "bad_request"

            response_payload = {"explanation": explanation} if status == "success" else {"explanation": None}

        except Exception as e:
             status = "internal_error"; errors.append({"code": "EXPLAIN_ERROR", "message": str(e)}); logger.exception(f"Error during async explanation for {decision_id}: {e}")

        response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

    async def provide_role_based_explanation(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Generates tailored explanation async via FM Client API proxy. Expects/Returns MIZ OKI."""
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); decision_id = payload.get("decision_id"); role = payload.get("role"); method = payload.get("method", 'chain_of_thought')
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        if not role: errors.append({"code": "MISSING_ROLE", "message": "'payload.role' is required for tailoring."})
        if errors:
            response = self._create_miz_oki_response(input_data, "bad_request", errors=errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        status = "pending"; response_payload = None; tailored_explanation = None

        # 1. Get base explanation first
        base_explain_request = {
            "payload": {"decision_id": decision_id, "method": method},
            "trace_id": trace_id, "request_id": f"xai_base_{request_id}" # Link requests
        }
        base_response = await self.explain_decision(base_explain_request)
        base_explanation = base_response.get("payload", {}).get("explanation")

        if base_response.get("status") != "success" or not base_explanation:
            # Failed to get base explanation, return the error
            status = base_response.get("status", "failed")
            errors = base_response.get("error_details", [{"code": "BASE_EXPLAIN_FAILED", "message": "Failed to get base explanation."}])
            response = self._create_miz_oki_response(input_data, status, response_payload, errors)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        # 2. Tailor the explanation using FM Client API proxy
        if not self.fm_client:
            # Return base explanation with warning if FM client unavailable
            status = "success"; response_payload = {"explanation": f"{base_explanation}\n\n(Tailoring failed: FM Client unavailable)"}
            response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        logger.info(f"Tailoring explanation async for role: {role} using FM Client API proxy...")
        # Use a model suitable for instruction following/summarization
        tailoring_model_info = self.config.get_model_info(self.config.foundation_models.defaults.feedback_analyzer_model) # Reuse feedback model alias
        tailoring_model_alias = tailoring_model_info.get("model_id") if tailoring_model_info else self.config.foundation_models.defaults.llama4_scout

        prompt = f"""Please tailor the following explanation for someone in the '{role}' role. Focus on the aspects most relevant to them and use appropriate language. Keep it concise.

Original Explanation:
---
{base_explanation}
---

Tailored Explanation for '{role}':"""

        try:
            # Call FM Client API proxy
            fm_request = {
                "payload": {"prompt": prompt, "model_alias": tailoring_model_alias, "max_tokens": 1024, "temperature": 0.3},
                "trace_id": trace_id, "request_id": f"fm_tailor_{request_id}"
            }
            fm_response = await self.fm_client.generate_text(input_data=fm_request) # Call API proxy

            if fm_response.get("status") == "success":
                generated_text = fm_response.get("payload",{}).get("generated_text")
                tailored_explanation = generated_text.strip() if generated_text else f"{base_explanation}\n\n(Tailoring failed: Generation returned empty result)"
                status = "success"
            else:
                raise RuntimeError(f"FM Client API proxy failed for tailoring: {fm_response.get('error_details')}")

            response_payload = {"explanation": tailored_explanation}

        except Exception as e:
             status = "partial_success"; errors.append({"code": "TAILORING_ERROR", "message": str(e)})
             logger.error(f"Async tailoring via FM client API proxy failed: {e}", exc_info=True)
             response_payload = {"explanation": f"{base_explanation}\n\n(Tailoring failed: {e})"} # Return base with error

        response = self._create_miz_oki_response(input_data, status, response_payload, errors if errors else None)
        response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        return response

# --- Initialization (Conceptual - Service instantiated by framework/orchestrator) ---
# xai: Optional[ExplainableAI] = None
# async def initialize_xai_service():
#     global xai
#     if not _config_obj or not _real_dependencies:
#         logger.critical("Cannot initialize XAI Service: Config or dependencies missing.")
#         return
#     try:
#         xai = ExplainableAI(config=_config_obj, fm_client_proxy=_fm_client_proxy, kg_tool_proxy=_kg_tool_proxy)
#         logger.info("ExplainableAI Service initialized.")
#     except Exception as e:
#         logger.critical(f"XAI Service initialization failed: {e}", exc_info=True)
#         xai = None

print("\n--- MIZ 3.0 Explainable AI (XAI) (Reworked) ---")
print("Uses real KG Tool API proxy & FM Client API proxy.")
print("KG storage uses structured nodes/rels. Cloud Logging is preferred non-KG storage.")
print("Handles MIZ OKI payloads for API interaction.")
print("-------------------------------------------------")

SyntaxError: invalid syntax (<ipython-input-16-3dc2ca43b83c>, line 47)

In [17]:
# Cell 15: MoA Architecture - Vertex AI / ADK Implementation Architecture (Reworked)
# Status: Finalized Descriptive Cell. Clarified component mappings further. Emphasized deployment strategy.
# Reasoning: Provides a clear, concise architectural overview guiding the implementation across other cells,
#            reflecting the finalized shift to managed GCP services.

import logging

# Ensure logger is configured (ideally inherited from Cell 1)
logger = logging.getLogger('MIZ-OKI.VertexMoAArchitecture')
if not logger.hasHandlers():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# --- Core Architectural Shift Description ---
logger.info("--- MIZ OKI 3.0 ARCHITECTURE - CELL 15: Vertex AI / ADK Implementation ---")
logger.info("This architecture replaces previous custom MoA implementations based on queues/persistence managers.")
logger.info("It leverages Google Cloud's Vertex AI platform for scalable, manageable agentic workflow orchestration.")
logger.info("Core Pillars: Vertex AI Workflows (Orchestration), Cloud Run/Functions (Tool/Agent Runtime), ADK/LangGraph (Agent Logic), Deployed Tools (Capabilities).")

# --- Mapping of Deprecated Concepts to Current Architecture ---
logger.info("\nMapping Deprecated Concepts -> Current Vertex AI / GCP Concepts:")
logger.info("  - MIZ_MoA_System / Orchestrator: Replaced by Vertex AI Workflows definitions + Execution Service.")
logger.info("  - UnifiedCommunicationSystem: Replaced by native input/output passing & state management within Vertex AI Workflows. Event-driven triggers via Pub/Sub.")
logger.info("  - RobustTaskQueue / TaskPersistenceManager: Replaced by Vertex AI Workflow Execution state persistence, automatic retries, logging, and tracing.")
logger.info("  - EnhancedBaseAgent / Agent Class: Represents the *logic* implemented within ADK/LangGraph structures, deployed as callable services (Cloud Run/Functions).")
logger.info("  - AgentFactory: Replaced by deploying agent/tool services (e.g., Cloud Run, Cloud Functions) and referencing their invocation URLs within Vertex AI Workflow definitions.")
logger.info("  - REWOOSystem / BossAgent Planning: Planning logic encapsulated within a dedicated ADK 'PlannerAgent' service or integrated into the API layer (Cell 16) triggering workflows. The agent/API uses the Vertex AI API (`executions_v1.ExecutionsClient`) to start workflow executions.")

# --- Key Vertex AI / GCP Components Utilized ---
logger.info("\nLeveraged GCP / Vertex AI Components:")
logger.info("  1. Vertex AI Workflows (`workflows_v1`, `executions_v1`): Defines the DAG for complex processes (KU->DM->LI->PO, Business Apps). Handles step sequencing, parallelism, conditionals, error handling, human-in-the-loop callbacks.")
logger.info("     -> Managed execution service provides state persistence, retries, logging, tracing.")
logger.info("  2. Cloud Run / Cloud Functions: Recommended runtime for deploying individual Tools and Agents (built with ADK/LangGraph or standard Python) as scalable, stateless (or stateful with external DB) microservices.")
logger.info("     -> Workflow steps invoke these services via authenticated HTTP calls (invoked via Workflow's built-in HTTP connector or custom connectors).")
logger.info("  3. ADK (Agent Development Kit) / LangGraph: Frameworks for building the core logic of individual agents/tools (Python classes/functions). Focuses on agent reasoning, state (often externalized), and tool usage.")
logger.info("  4. Deployed Tools/Services (This Codebase): The Python classes (e.g., `KnowledgeGraphToolService`, `SemanticGraphRAGTool`, `BEABTool`) represent the logic deployed within Cloud Run/Functions services.")
logger.info("  5. Vertex AI Agent Builder (Console/APIs): Used to manage Tools (registering function specs/OpenAPI specs of deployed services), connect Data sources (Vertex AI Search for grounding), and potentially build simpler conversational agents (less relevant for this backend architecture).")
logger.info("  6. Google Cloud Pub/Sub: Used for event-driven workflow triggers (e.g., MLOps completion, external events like new data arrival, internal signals like LI actions, human approvals).")
logger.info("  7. Google Secret Manager: Securely stores API keys, database credentials, MIZ_SALT, etc., accessed by services/workflows.")
logger.info("  8. Google Cloud Logging & Tracing: Provide observability into Workflow executions and Tool/Agent service calls. MIZ OKI trace IDs facilitate end-to-end tracing.")
logger.info("  9. IAM & Service Accounts: Securely manage permissions for workflows calling tools and tools accessing GCP services.")
logger.info(" 10. Vertex AI Pipelines (KFP): Used for orchestrating ML training/evaluation/deployment workflows (Cell 17), triggered via Pub/Sub.")
logger.info(" 11. Other GCP Services: Cloud Storage, BigQuery, Neo4j (AuraDB/Managed), Vertex AI Vector Search as needed by specific tools.")

# --- Implementation Strategy Summary ---
logger.info("\nImplementation Approach:")
logger.info("  - Define business processes and agentic sequences as Vertex AI Workflows (YAML).")
logger.info("  - Implement reusable capabilities (KG access, FM calls, Causal AI, Business Logic, External API interactions) as async Python functions/classes within Tool/Service modules (like the reworked Cells 2-8, 10-11, 16, 18).")
logger.info("  - Deploy these Tools/Services using Cloud Run (preferred for flexibility) or Cloud Functions, ensuring they handle MIZ OKI JSON payloads via HTTP POST.")
logger.info("  - Secure deployed services using IAM invoker roles and authentication within the service code (e.g., verifying OIDC tokens from Workflow calls).")
logger.info("  - Configure Workflow steps to make authenticated HTTP calls to the deployed Tool/Agent services, passing MIZ OKI payloads as input and receiving them as output.")
logger.info("  - Use Vertex AI Workflow variables and state passing (`assign`, `result`) for context transfer between steps.")
logger.info("  - Trigger workflows via the Human Interface API (Cell 16 making calls to `executions_v1.create_execution`), Pub/Sub messages, or Cloud Scheduler.")
logger.info("  - Implement MLOps using Vertex AI Pipelines (Cell 17), triggered via Pub/Sub messages published by relevant tools (e.g., AKA/B.O.S.S., DRL Manager).")
logger.info("  - Utilize GCP's native logging, tracing, and monitoring, ensuring MIZ OKI trace IDs are propagated.")

# --- Conclusion ---
logger.info("\nConclusion: This Vertex AI-native architecture provides a robust, scalable, and maintainable foundation for MIZ OKI 3.0, focusing development effort on workflow definition and the business/AI logic within deployable Tools and Agents, leveraging managed GCP services for orchestration, execution, and observability.")

print("\n--- MIZ 3.0 MoA Architecture Definition Finalized (Cell 15 - Vertex AI Integrated) ---")
# No executable code needed in this cell. It serves as documentation for the architecture.


--- MIZ 3.0 MoA Architecture Definition Finalized (Cell 15 - Vertex AI Integrated) ---


In [18]:
# Cell 16: Human-Agent Collaboration Interface (API/Spec - AW Pillar) (Reworked)
# Status: Uses REAL google-cloud-workflows client proxy (if available) for workflow interactions.
#         Calls backend tools via injected proxies (conceptual MIZ OKI APIs).
#         Error handling improved. Still requires deployment in API framework & real backend tools.

import logging
import datetime
import json
import uuid
import asyncio
from typing import Dict, Any, Optional, List, Union, Callable
import time
import random

# --- GCP Client Libraries & Dependencies ---
# Needed for interacting with Vertex AI Workflow Executions and Pub/Sub
try:
   from google.cloud import workflows_v1 # Not directly used here, but executions client depends on it
   from google.cloud.workflows import executions_v1
   from google.cloud.workflows.executions_v1.types import Execution
   from google.protobuf import json_format
   from google.api_core import exceptions as google_api_exceptions
   from google.cloud import pubsub_v1
   VERTEX_WORKFLOWS_SDK_AVAILABLE = True
   PUBSUB_SDK_AVAILABLE = True
   logger.debug("Successfully imported google-cloud-workflows/pubsub libraries for Cell 16.")
except ImportError:
   logger.warning("google-cloud-workflows or google-cloud-pubsub library not found. API interactions will be mocked or limited.")
   VERTEX_WORKFLOWS_SDK_AVAILABLE = False
   PUBSUB_SDK_AVAILABLE = False
   # Dummy classes from Cell 16 reformation (or Cell 9)
   class ExecutionState: ACTIVE="ACTIVE"; SUCCEEDED="SUCCEEDED"; FAILED="FAILED"; CANCELLED="CANCELLED"; SUSPENDED="SUSPENDED"
   class DummyProto: pass
   class executions_v1: ExecutionState = ExecutionState; class ExecutionsAsyncClient: pass; class Execution(DummyProto): pass; class CreateExecutionRequest: pass; class GetExecutionRequest: pass; class CancelExecutionRequest: pass; class ListExecutionsRequest: pass; class ListExecutionsResponse: pass; ExecutionView = type('Enum', (), {'BASIC': 1, 'FULL': 2})()
   class json_format: @staticmethod def MessageToDict(msg, **kwargs): return getattr(msg, '_fields', {})
   class google_api_exceptions: class NotFound(Exception): pass; class InvalidArgument(Exception): pass; class PermissionDenied(Exception): pass; class FailedPrecondition(Exception):pass; class GoogleAPIError(Exception): pass
   class pubsub_v1: class PublisherClient: pass # Dummy client

# --- Assume Real Tool/Client Dependencies are Injected/Available ---
# Proxies represent API clients for other deployed MIZ OKI services.
# GCP clients represent actual SDK clients or mocks.
try:
    # Use CONFIG_OBJ loaded in Cell 1
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ: raise NameError("CONFIG_OBJ not found or is None")
    if not isinstance(CONFIG_OBJ, EnhancedConfig): raise NameError("CONFIG_OBJ is not an EnhancedConfig instance")

    # Proxies for MIZ OKI Tools
    if 'hde_tool' not in globals(): raise NameError("hde_tool proxy not found") # Cell 5 Proxy
    if 'agg_tool' not in globals(): raise NameError("agg_tool proxy not found") # Cell 5 Proxy (Goal Generator)
    if 'scf_tool' not in globals(): raise NameError("scf_tool proxy not found") # Cell 5 Proxy (Feedback/Validator source)
    if 'xai' not in globals(): raise NameError("xai proxy not found") # Cell 11 Proxy

    # Real/Mock GCP Clients
    if '_workflow_executions_client' not in globals(): raise NameError("_workflow_executions_client not found") # Cell 16 needs this
    if '_pubsub_client' not in globals(): raise NameError("_pubsub_client not found") # Cell 16 needs this

    _config_obj = CONFIG_OBJ
    _hde_tool_proxy = hde_tool
    _agg_tool_proxy = agg_tool
    _validator_tool_proxy = scf_tool # Renaming for clarity in this context
    _xai_tool_proxy = xai
    _exec_client_proxy = _workflow_executions_client # Use real/mock client proxy
    _pubsub_client_proxy = _pubsub_client # Use real/mock client proxy
    _real_backend_tools = True
    logger.debug("Using real/conceptual backend tool proxies and clients in Cell 16 (Reworked).")

except NameError as e:
   logger.warning(f"Dependency Error ({e}). Using Placeholders for backend tools in Cell 16 (Reworked).")
   _real_backend_tools = False
   # --- Mock/Placeholder Setup ---
   class PlaceholderHDE: async def get_history(self, request): return {"status": "success", "payload": {"history": []}}; async def update_decision_log(self, request): return {"status": "success"}
   class PlaceholderAGG: async def get_active_goals(self, request): return {"status": "success", "payload": {"goals": []}}; async def add_goal(self, request): return {"status": "success", "payload": {"goal_id": f"goal_{uuid.uuid4().hex[:6]}"}}
   class PlaceholderCV: async def add_feedback(self, input_data): return {"status": "success"}
   class PlaceholderXAI: async def explain_decision(self, input_data): return {"status": "success", "payload": {"explanation": "Placeholder Explanation"}}; async def provide_role_based_explanation(self, input_data): return {"status": "success", "payload": {"explanation": "Placeholder Role Explanation"}}
   _hde_tool_proxy = PlaceholderHDE(); _agg_tool_proxy = PlaceholderAGG(); _validator_tool_proxy = PlaceholderCV(); _xai_tool_proxy = PlaceholderXAI()
   # Define minimal config if needed
   if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ:
        from dataclasses import dataclass, field
        @dataclass class MockGcpConfig: project_id: str = "mock-project"; region: str = "mock-region"
        @dataclass class MockSysThresholds: human_review_confidence_threshold: float = 0.75
        @dataclass class MockConfig: gcp: MockGcpConfig = field(default_factory=MockGcpConfig); system_thresholds: MockSysThresholds = field(default_factory=MockSysThresholds); miz_oki_schema_version: str = "3.0"; def get(self, key, default=None): return getattr(self, key, default)
        _config_obj = MockConfig()
   # Use Mock Client from Cell 9 reformation if real one not available
   if '_workflow_executions_client' not in globals():
        class MockVertexExecClient:
            _executions = {}; State = ExecutionState
            async def create_execution(self, request): exec_id_suffix = uuid.uuid4().hex[:12]; exec_name = f"{request.parent}/executions/{exec_id_suffix}"; self._executions[exec_name] = {"name": exec_name, "state": self.State.ACTIVE, "start_time": datetime.now(datetime.timezone.utc)}; if random.random() < 0.1: self._executions[exec_name]["state"] = self.State.SUSPENDED; return MagicMock(name=exec_name, state=self._executions[exec_name]["state"], start_time=self._executions[exec_name]["start_time"])
            async def get_execution(self, request): exec_data = self._executions.get(request.name); if not exec_data: raise google_api_exceptions.NotFound(); if exec_data["state"] == self.State.ACTIVE and random.random() < 0.1: exec_data["state"] = self.State.SUCCEEDED; return MagicMock(**exec_data)
            async def cancel_execution(self, request): exec_name = request.name; if exec_name in self._executions and self._executions[exec_name]["state"] in [self.State.ACTIVE, self.State.SUSPENDED]: self._executions[exec_name]["state"] = self.State.CANCELLED; return MagicMock(**self._executions[exec_name]); raise google_api_exceptions.FailedPrecondition()
            async def list_executions(self, request): results = []; parent_prefix = request.parent + "/executions/"; # ... (mock list logic) ...; return MagicMock(executions=results)
        _exec_client_proxy = MockVertexExecClient()
        logger.warning("Using Mock Vertex Executions Client for Human API (Reworked).")
   if '_pubsub_client' not in globals(): class MockPubSubClient: async def publish(self, topic, data): return f"msg_{uuid.uuid4().hex[:8]}"
   _pubsub_client_proxy = _pubsub_client or MockPubSubClient()
   # --- End Mock Setup ---

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('MIZ-OKI.HumanAgentInterfaceAPI')

class HumanAgentInterfaceAPI:
   """ Backend logic for Human-Agent Collaboration API. Uses real Vertex client proxy if available. """
   def __init__(self, decision_engine_proxy: Any, goal_generator_proxy: Any, validator_proxy: Any, xai_proxy: Any,
                executions_client_proxy: Any, pubsub_client_proxy: Any, config: EnhancedConfig):
       if not all([decision_engine_proxy, goal_generator_proxy, validator_proxy, xai_proxy, executions_client_proxy, pubsub_client_proxy, config]):
            raise InitializationError("HumanAgentInterfaceAPI initialized with MISSING dependencies!")
       self.hde_tool = decision_engine_proxy
       self.agg_tool = goal_generator_proxy
       self.validator_tool = validator_proxy # For submitting feedback
       self.xai_tool = xai_proxy
       self.exec_client = executions_client_proxy # Use the injected client proxy
       self.pubsub_client = pubsub_client_proxy # Use the injected client proxy
       self.config = config
       self.project = config.gcp.project_id
       self.location = config.gcp.region
       self.logger = logging.getLogger('MIZ-OKI.HumanAgentInterfaceAPI')
       self.logger.info("Human-Agent Interface API logic initialized (Reworked).")

   async def _check_permissions(self, user_id: str, action: str, resource_id: Optional[str] = None) -> bool:
       """ Placeholder for permission checking logic. """
       # --- TODO: Implement actual permission check ---
       # - Integrate with IAM, OAuth scopes, or an internal role-based access control (RBAC) system.
       # - Check if user_id has permission to perform 'action' on 'resource_id'.
       # - Example: Check if user is in 'MIZ_APPROVERS' group for 'approve_action'.
       # --- End TODO ---
       self.logger.debug(f"PERMISSION CHECK (Placeholder): User '{user_id}', Action '{action}', Resource '{resource_id}'. Allowing.")
       await asyncio.sleep(0.001) # Simulate check latency
       return True # Default to allow for now

   async def _get_parent_path(self) -> str:
       """Constructs the parent path for workflow resources."""
       if not self.project or not self.location: raise ValueError("GCP Project ID/Location missing in config.")
       return f"projects/{self.project}/locations/{self.location}"

   def _create_miz_oki_response(self, status: str, payload: Optional[Dict] = None, errors: Optional[List[Dict]] = None, request_data: Optional[Dict]=None) -> Dict:
        """Helper to construct a standard MIZ OKI response for the API layer."""
        # Note: This API layer itself might not receive a full MIZ OKI request if called via HTTP,
        # but it should return one. We simulate having request context if needed.
        req_data = request_data or {}
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": req_data.get("request_id", f"api_resp_{uuid.uuid4().hex[:8]}"), # Generate if not passed
            "trace_id": req_data.get("trace_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "HumanAgentInterfaceAPI",
            "target_component": req_data.get("source_component"), # Echo back caller if known
            "status": status,
            "payload": payload,
            "error_details": errors,
            "metadata": {}
        }

   # --- Decision/Workflow Review & Approval ---
   async def get_pending_reviews(self, user_id: str, limit: int = 20) -> Dict[str, Any]:
       """API Method: Fetches items needing human review (paused workflows, low-confidence decisions)."""
       self.logger.info(f"API: get_pending_reviews async for user '{user_id}' (limit: {limit})")
       start_time = time.monotonic()
       if not await self._check_permissions(user_id, "view_reviews"):
           return self._create_miz_oki_response(None, "permission_denied", errors=[{"message": "Permission denied"}])

       # Fetch from different sources concurrently
       fetch_tasks = [
           self._fetch_hde_reviews(limit, user_id), # Pass user_id if HDE needs it for filtering
           self._fetch_paused_vertex_workflows(limit, user_id) # Pass user_id if filtering needed
       ]
       results_or_exceptions = await asyncio.gather(*fetch_tasks, return_exceptions=True)

       combined_pending = []
       errors = []
       for i, result in enumerate(results_or_exceptions):
           source = "HDE" if i == 0 else "Workflows"
           if isinstance(result, Exception):
               error_msg = f"Error fetching reviews from {source}: {result}"
               self.logger.error(error_msg, exc_info=True)
               errors.append({"code": f"FETCH_{source}_ERROR", "message": error_msg})
           elif isinstance(result, list):
               combined_pending.extend(result)

       # Sort by timestamp (descending) and limit
       try:
           # Ensure timestamp is present and valid for sorting
           def get_sort_key(item):
               ts_str = item.get('timestamp')
               if ts_str:
                   try: return datetime.fromisoformat(ts_str.replace('Z', '+00:00'))
                   except ValueError: return datetime.min.replace(tzinfo=datetime.timezone.utc) # Put items with bad timestamps first/last
               return datetime.min.replace(tzinfo=datetime.timezone.utc) # Items without timestamp first/last

           combined_pending.sort(key=get_sort_key, reverse=True)
       except Exception as sort_e:
           self.logger.warning(f"Could not sort pending reviews: {sort_e}")
           errors.append({"code": "SORT_ERROR", "message": f"Sorting failed: {sort_e}"})

       final_list = combined_pending[:limit]
       status = "success" if not errors else "partial_success"
       response = self._create_miz_oki_response(None, status, {"pending_reviews": final_list}, errors if errors else None)
       response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
       return response

   async def _fetch_hde_reviews(self, limit: int, user_id: str) -> List[Dict]:
       """Fetches low-confidence decisions from HDE Tool API proxy."""
       pending = []; count = 0
       if not self.hde_tool or not hasattr(self.hde_tool, 'get_history'):
           self.logger.warning("HDE Tool proxy unavailable or get_history method missing.")
           return pending
       try:
           # Call HDE Tool API proxy - request more than limit to allow filtering
           hde_request = {"payload": {"limit": limit * 2, "status_filter": "pending_review"}} # Example filter
           hde_response = await self.hde_tool.get_history(request=hde_request) # Call API proxy

           if hde_response.get("status") == "success":
               history = hde_response.get("payload", {}).get("history", [])
               threshold = self.config.system_thresholds.human_review_confidence_threshold
               for decision in history:
                    if count >= limit: break
                    if isinstance(decision, dict) and not decision.get('human_review_status'): # Check if not already reviewed
                         confidence = decision.get('final_confidence', 1.0)
                         ethics_flagged = isinstance(decision.get('ethics_flag'), dict) # Check if ethics flag exists
                         reason = ""
                         if confidence < threshold: reason += f"Confidence ({confidence:.2f} < {threshold:.2f})"
                         if ethics_flagged: reason += (" and " if reason else "") + "Ethics Flagged"

                         if reason: # Only add if review is needed
                             pending.append({
                                 "review_id": decision.get("decision_id"),
                                 "type": "decision",
                                 "summary": f"Review decision '{decision.get('decision_type', 'N/A')}' ({reason})",
                                 "timestamp": decision.get('timestamp_start'),
                                 "details_link": f"/api/decisions/{decision.get('decision_id')}" # Conceptual link
                             })
                             count += 1
           else:
               logger.error(f"HDE Tool API get_history failed: {hde_response.get('error_details')}")
       except Exception as e:
           self.logger.error(f"Error fetching HDE reviews via API proxy: {e}", exc_info=True)
       return pending

   async def _fetch_paused_vertex_workflows(self, limit: int, user_id: str) -> List[Dict]:
       """ Fetches paused workflows using the REAL Vertex AI Executions Client proxy. """
       paused_workflows = []
       if not self.exec_client or not VERTEX_WORKFLOWS_SDK_AVAILABLE:
           self.logger.warning("Vertex Executions client proxy/SDK unavailable. Cannot fetch paused workflows.")
           return paused_workflows
       try:
           parent = await self._get_parent_path()
           # Filter for SUSPENDED state
           filter_string = f'state = "{ExecutionState.SUSPENDED.name}"'
           # --- TODO: Add filtering based on user_id if workflows have labels/metadata indicating assignee ---
           # Example: filter_string += f' AND labels.assigned_user = "{user_id}"'
           # --- End TODO ---
           request = executions_v1.ListExecutionsRequest(
               parent=parent,
               view=executions_v1.ExecutionView.BASIC, # BASIC view is usually enough
               filter=filter_string,
               page_size=limit # Limit results from the API
           )
           count = 0
           # Use the async iterator provided by the client proxy
           async for execution in await self.exec_client.list_executions(request=request):
                if count >= limit: break
                # --- TODO: Add check for specific callback/human step label if possible ---
                # This might involve getting FULL view or checking execution args/state
                # --- End TODO ---
                exec_name_full = execution.name
                # Extract short workflow ID and execution ID
                parts = exec_name_full.split('/')
                wf_id_short = parts[5] if len(parts) > 5 else "unknown_wf"
                exec_id_short = parts[-1]

                paused_workflows.append({
                    "review_id": exec_name_full, # Use full name as the unique ID
                    "type": "workflow",
                    "summary": f"Approve workflow '{wf_id_short}' (Execution: ...{exec_id_short[-6:]})",
                    "timestamp": execution.start_time.isoformat() if execution.start_time else None,
                    "details_link": f"/api/workflows/executions/{exec_id_short}" # Conceptual link
                })
                count += 1
           self.logger.info(f"Fetched {len(paused_workflows)} SUSPENDED Vertex AI workflows via client proxy.")
       except google_api_exceptions.GoogleAPIError as api_e:
           self.logger.error(f"API Error fetching paused Vertex workflows via client proxy: {api_e}")
       except Exception as e:
           self.logger.error(f"Unexpected Error fetching paused Vertex workflows via client proxy: {e}", exc_info=True)
       return paused_workflows

   async def approve_action(self, user_id: str, review_id: str, approval_data: Optional[Dict] = None, comments: Optional[str] = None) -> Dict[str, Any]:
       """ API Method: Approves a pending workflow (via PubSub signal) or decision (via HDE API proxy). """
       self.logger.info(f"API: approve_action async for '{review_id}' by '{user_id}'")
       start_time = time.monotonic()
       if not await self._check_permissions(user_id, "approve_action", review_id):
           return self._create_miz_oki_response(None, "permission_denied", errors=[{"message": "Permission denied"}])

       is_workflow_exec_id = review_id.startswith("projects/") and "/workflows/" in review_id and "/executions/" in review_id
       status = "error"; message = "Approval failed: Item not found or mechanism error."; errors = []

       # --- Try Workflow Approval via Pub/Sub Signal ---
       if is_workflow_exec_id:
            if self.pubsub_client and PUBSUB_SDK_AVAILABLE:
                try:
                    # --- TODO: Get approval topic name from config ---
                    approval_signal_topic_name = "workflow-approvals" # Example
                    approval_signal_topic_path = f"projects/{self.project}/topics/{approval_signal_topic_name}"
                    # --- End TODO ---
                    signal_data = {
                        "execution_id": review_id,
                        "approved": True,
                        "approval_data": approval_data or {}, # Data needed by the workflow to continue
                        "comments": comments,
                        "approver": user_id,
                        "timestamp": datetime.now(datetime.timezone.utc).isoformat()
                    }
                    # Call REAL Pub/Sub client proxy
                    message_id = await self.pubsub_client.publish(approval_signal_topic_path, json.dumps(signal_data).encode('utf-8'))
                    self.logger.info(f"Published APPROVAL signal for execution {review_id} via Pub/Sub proxy. Message ID: {message_id}")
                    status = "approved"; message = "Workflow approval signal sent."
                    response = self._create_miz_oki_response(None, status, {"review_id": review_id, "message": message})
                    response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
                    return response # Return early if workflow signal sent
                except Exception as e:
                    error_msg = f"Error sending approval signal for workflow {review_id} via Pub/Sub proxy: {e}"
                    self.logger.error(error_msg, exc_info=True)
                    errors.append({"code": "PUBSUB_SIGNAL_ERROR", "message": error_msg})
                    # Continue to try HDE update as fallback if applicable
            else:
                 errors.append({"code": "MISSING_DEPENDENCY", "message": "PubSub client proxy/SDK unavailable for workflow approval."})

       # --- Fallback/Alternative: Update HDE Decision Log via API proxy ---
       if self.hde_tool and hasattr(self.hde_tool, 'update_decision_log'):
           try:
               update_payload = {
                   'human_review_status': 'approved',
                   'human_reviewer': user_id,
                   'human_review_comments': comments,
                   'human_review_timestamp': datetime.now(datetime.timezone.utc).isoformat(),
                   'human_approval_data': approval_data # Store any data passed with approval
               }
               hde_request = {"payload": {"decision_id": review_id, "update_data": update_payload}, "request_id": f"hde_approve_{review_id}"}
               hde_response = await self.hde_tool.update_decision_log(request=hde_request) # Call API proxy

               if hde_response.get("status") == "success":
                   status = "approved"; message = "Decision approved in HDE log."
                   # If workflow approval failed earlier, status remains 'error' overall
                   if errors: status = "partial_failure"; message += " (Workflow signal failed)"
               else:
                   error_msg = f"HDE Tool API update failed for approval: {hde_response.get('error_details')}"
                   logger.error(error_msg)
                   errors.append({"code": "HDE_UPDATE_ERROR", "message": error_msg})
                   # If workflow also failed, status remains 'error'
                   if not is_workflow_exec_id: status = "failed" # If only HDE update was attempted and failed

           except Exception as e:
               error_msg = f"Error calling HDE Tool API proxy for approval {review_id}: {e}"
               self.logger.error(error_msg, exc_info=True)
               errors.append({"code": "HDE_PROXY_ERROR", "message": error_msg})
               if not is_workflow_exec_id: status = "failed" # If only HDE update was attempted and failed
       elif not is_workflow_exec_id: # If it wasn't a workflow ID and HDE tool is missing
            errors.append({"code": "MISSING_DEPENDENCY", "message": "HDE Tool proxy unavailable for decision approval."})

       # Determine final status based on errors
       if not errors and status == "approved": pass # All good
       elif errors and status == "approved": status = "partial_failure" # Workflow failed, HDE succeeded
       else: status = "failed" # Either both failed, or only HDE was tried and failed

       response = self._create_miz_oki_response(None, status, {"review_id": review_id, "message": message}, errors if errors else None)
       response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
       return response

   async def reject_action(self, user_id: str, review_id: str, reason: str, feedback_data: Optional[Dict] = None) -> Dict[str, Any]:
       """ API Method: Rejects a workflow (CANCEL via Vertex API proxy) or decision (via HDE API proxy) and submits feedback (via Validator API proxy). """
       self.logger.info(f"API: reject_action async for '{review_id}' by '{user_id}'. Reason: {reason}")
       start_time = time.monotonic()
       if not await self._check_permissions(user_id, "reject_action", review_id):
           return self._create_miz_oki_response(None, "permission_denied", errors=[{"message": "Permission denied"}])
       if not reason or not reason.strip():
           return self._create_miz_oki_response(None, "bad_request", errors=[{"message": "Rejection reason required."}])

       rejected = False; item_type = "unknown"; errors = []

       # --- Try Cancelling Vertex AI Workflow via REAL Client Proxy ---
       is_workflow_exec_id = review_id.startswith("projects/") and "/workflows/" in review_id and "/executions/" in review_id
       if is_workflow_exec_id:
            item_type = "workflow"
            if self.exec_client and VERTEX_WORKFLOWS_SDK_AVAILABLE:
                try:
                    request = CancelExecutionRequest(name=review_id) # Use real request type
                    await self.exec_client.cancel_execution(request=request) # Call proxy method
                    self.logger.info(f"Workflow execution {review_id} cancellation request sent by {user_id} via client proxy.")
                    rejected = True
                except google_api_exceptions.FailedPrecondition as fp_e:
                    # This likely means the workflow already finished (SUCCEEDED or FAILED)
                    logger.warning(f"Cannot cancel workflow {review_id} (likely already finished): {fp_e}.")
                    errors.append({"code": "CANCEL_PRECONDITION_FAILED", "message": "Workflow likely already finished."})
                except google_api_exceptions.NotFound:
                    logger.warning(f"Workflow execution {review_id} not found for cancellation.")
                    errors.append({"code": "WORKFLOW_NOT_FOUND", "message": "Workflow execution not found."})
                except google_api_exceptions.GoogleAPIError as api_e:
                    error_msg = f"API Error cancelling workflow {review_id} via client proxy: {api_e}"
                    self.logger.error(error_msg)
                    errors.append({"code": "VERTEX_API_ERROR", "message": error_msg})
                except Exception as e_cancel:
                    error_msg = f"Error sending cancel request for workflow {review_id} via client proxy: {e_cancel}"
                    self.logger.error(error_msg, exc_info=True)
                    errors.append({"code": "CANCEL_PROXY_ERROR", "message": error_msg})
            else:
                 errors.append({"code": "MISSING_DEPENDENCY", "message": "Vertex Executions client proxy/SDK unavailable for workflow cancellation."})

       # --- Fallback/Alternative: Update HDE Decision Log via API proxy ---
       if not rejected: # Try HDE update only if workflow cancellation wasn't attempted or failed
           item_type = "decision"
           if self.hde_tool and hasattr(self.hde_tool, 'update_decision_log'):
               try:
                   update_payload = {
                       'human_review_status': 'rejected',
                       'human_reviewer': user_id,
                       'human_rejection_reason': reason,
                       'human_review_timestamp': datetime.now(datetime.timezone.utc).isoformat()
                   }
                   hde_request = {"payload": {"decision_id": review_id, "update_data": update_payload}, "request_id": f"hde_reject_{review_id}"}
                   hde_response = await self.hde_tool.update_decision_log(request=hde_request) # Call API proxy

                   if hde_response.get("status") == "success":
                       rejected = True
                       logger.info(f"Decision {review_id} marked as rejected in HDE log.")
                   else:
                       error_msg = f"HDE Tool API update failed for rejection: {hde_response.get('error_details')}"
                       logger.error(error_msg)
                       errors.append({"code": "HDE_UPDATE_ERROR", "message": error_msg})
               except Exception as e:
                   error_msg = f"Error calling HDE Tool API proxy for rejection {review_id}: {e}"
                   self.logger.error(error_msg, exc_info=True)
                   errors.append({"code": "HDE_PROXY_ERROR", "message": error_msg})
           elif not is_workflow_exec_id: # Only error if HDE was the only option
                errors.append({"code": "MISSING_DEPENDENCY", "message": "HDE Tool proxy unavailable for decision rejection."})

       # --- Submit Feedback Async via Validator/SCF Tool API Proxy ---
       if rejected and feedback_data is not None:
           if self.validator_tool and hasattr(self.validator_tool, 'add_feedback'):
               feedback_entry = {
                   "feedback_id": f"fb_rej_{uuid.uuid4().hex[:8]}",
                   "user_id": user_id,
                   "item_type": item_type,
                   "component_id": review_id, # ID of the rejected item
                   "feedback": {"rejection_reason": reason, **(feedback_data or {})},
                   "type": "rejection_feedback", # Specific feedback type
                   "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
                   "source": f"human_rejection:{user_id}"
               }
               try:
                   # Prepare MIZ OKI request for Validator Tool
                   val_request = {"payload": feedback_entry, "request_id": f"val_feedback_{review_id}"}
                   # Fire-and-forget the feedback submission
                   asyncio.create_task(self.validator_tool.add_feedback(input_data=val_request)) # Call API proxy
                   self.logger.info(f"Async feedback submission task created for rejected {review_id}.")
               except Exception as e:
                   error_msg = f"Error submitting feedback async via Validator API proxy for {review_id}: {e}"
                   self.logger.error(error_msg, exc_info=True)
                   errors.append({"code": "FEEDBACK_SUBMIT_ERROR", "message": error_msg})
           else:
                errors.append({"code": "MISSING_DEPENDENCY", "message": "Validator Tool proxy unavailable for feedback submission."})

       # Determine final status
       status = "rejected" if rejected else "failed"
       message = "Action rejected/cancelled." if rejected else "Rejection failed."
       response = self._create_miz_oki_response(None, status, {"review_id": review_id, "item_type": item_type, "message": message}, errors if errors else None)
       response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
       return response

   # --- Feedback Submission (Uses Real Validator/SCF Tool Proxy) ---
   async def submit_general_feedback(self, user_id: str, component_id: str, feedback_data: Dict) -> Dict[str, Any]:
       """ API Method: Submits general feedback via Validator/SCF Tool API proxy. """
       self.logger.info(f"API: submit_general_feedback async by '{user_id}' for component '{component_id}'")
       start_time = time.monotonic()
       if not await self._check_permissions(user_id, "submit_feedback"):
           return self._create_miz_oki_response(None, "permission_denied", errors=[{"message": "Permission denied"}])
       if not self.validator_tool or not hasattr(self.validator_tool, 'add_feedback'):
           return self._create_miz_oki_response(None, "service_unavailable", errors=[{"message": "Feedback system (Validator Tool proxy) unavailable."}])
       if not isinstance(feedback_data, dict) or not feedback_data:
            return self._create_miz_oki_response(None, "bad_request", errors=[{"message": "feedback_data (dict) is required."}])

       status = "pending"; response_payload = None; errors = []
       try:
            feedback_entry = {
                "feedback_id": f"fb_gen_{uuid.uuid4().hex[:8]}",
                "user_id": user_id,
                "component_id": component_id,
                "feedback": feedback_data,
                "type": feedback_data.get("type", "general"), # Allow type override in data
                "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
                "source": f"human_general:{user_id}"
            }
            # Prepare MIZ OKI request for Validator Tool
            val_request = {"payload": feedback_entry, "request_id": f"val_gen_fb_{component_id}"}
            val_response = await self.validator_tool.add_feedback(input_data=val_request) # Call API proxy

            if val_response.get("status") == "success":
                status = "submitted"; response_payload = {"feedback_id": feedback_entry['feedback_id']}
            else:
                status = "failed"; errors.append({"code": "FEEDBACK_SUBMIT_FAILED", "message": f"Feedback submission failed in Validator Tool: {val_response.get('error_details')}"})
       except Exception as e:
            status = "internal_error"; errors.append({"code": "INTERNAL_ERROR", "message": f"Internal error submitting feedback: {e}"})
            self.logger.error(f"Error submitting general feedback via Validator API proxy: {e}", exc_info=True)

       response = self._create_miz_oki_response(None, status, response_payload, errors if errors else None)
       response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
       return response

   # --- Goal Management (Uses Real AGG Tool Proxy) ---
   async def get_active_goals(self, user_id: str, domain: Optional[str] = None) -> Dict[str, Any]:
       """ API Method: Fetches active goals via AGG Tool API proxy. """
       self.logger.info(f"API: get_active_goals async for user '{user_id}', domain '{domain}'")
       start_time = time.monotonic()
       if not await self._check_permissions(user_id, "view_goals"):
           return self._create_miz_oki_response(None, "permission_denied", errors=[{"message": "Permission denied"}])
       if not self.agg_tool or not hasattr(self.agg_tool, 'get_active_goals'):
           return self._create_miz_oki_response(None, "service_unavailable", errors=[{"message": "Goal generator (AGG Tool proxy) unavailable."}])

       status = "pending"; response_payload = None; errors = []
       try:
            # Prepare MIZ OKI request for AGG Tool
            agg_request = {"payload": {"domain_filter": domain}, "request_id": f"agg_get_goals_{user_id}"}
            agg_response = await self.agg_tool.get_active_goals(request=agg_request) # Call API proxy

            if agg_response.get("status") == "success":
                status = "success"; response_payload = {"active_goals": agg_response.get("payload", {}).get("goals", [])}
            else:
                status = "failed"; errors.append({"code": "AGG_FETCH_ERROR", "message": f"Error fetching goals from AGG Tool API: {agg_response.get('error_details')}"})
       except Exception as e:
            status = "internal_error"; errors.append({"code": "INTERNAL_ERROR", "message": f"Error fetching goals: {e}"})
            self.logger.error(f"Error calling AGG Tool API proxy get_active_goals: {e}", exc_info=True)

       response = self._create_miz_oki_response(None, status, response_payload, errors if errors else None)
       response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
       return response

   async def add_manual_goal(self, user_id: str, description: str, kpis: List[str], owner_agent: str="human", priority: float = 0.5, target_values: Optional[Dict] = None) -> Dict[str, Any]:
       """ API Method: Adds a manual goal via AGG Tool API proxy (which triggers planning). """
       self.logger.info(f"API: add_manual_goal async by '{user_id}': {description}")
       start_time = time.monotonic()
       if not await self._check_permissions(user_id, "add_goal"):
           return self._create_miz_oki_response(None, "permission_denied", errors=[{"message": "Permission denied"}])
       if not self.agg_tool or not hasattr(self.agg_tool, 'add_goal'):
           return self._create_miz_oki_response(None, "service_unavailable", errors=[{"message": "Goal generator (AGG Tool proxy) unavailable."}])
       if not description or not kpis:
            return self._create_miz_oki_response(None, "bad_request", errors=[{"message": "Description and KPIs are required."}])

       status = "pending"; response_payload = None; errors = []
       try:
            # Prepare MIZ OKI request for AGG Tool API proxy
            agg_request = {
                "payload": {
                    "description": description, "kpis": kpis, "owner_agent": owner_agent,
                    "priority": priority, "target_values": target_values,
                    "source": f"human:{user_id}"
                },
                "request_id": f"agg_add_goal_{user_id}"
            }
            agg_response = await self.agg_tool.add_goal(request=agg_request) # Call API proxy method

            if agg_response.get("status") == "success":
                status = "created"; response_payload = {"goal_id": agg_response.get("payload", {}).get("goal_id")}
            else:
                status = "failed"; errors.append({"code": "AGG_ADD_ERROR", "message": f"Failed to add goal via AGG Tool API: {agg_response.get('error_details')}"})
       except Exception as e:
            status = "internal_error"; errors.append({"code": "INTERNAL_ERROR", "message": f"Internal error adding goal: {e}"})
            self.logger.error(f"Error calling AGG Tool API proxy add_goal: {e}", exc_info=True)

       response = self._create_miz_oki_response(None, status, response_payload, errors if errors else None)
       response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
       return response

   # --- Explainability Access (Uses Real XAI Tool Proxy) ---
   async def get_decision_explanation(self, user_id: str, decision_or_execution_id: str, method: str = "chain_of_thought", role: Optional[str] = None) -> Dict[str, Any]:
       """ API Method: Gets explanation via XAI Tool API proxy. """
       self.logger.info(f"API: get_decision_explanation async for '{decision_or_execution_id}' by '{user_id}', method '{method}', role '{role}'")
       start_time = time.monotonic()
       # --- TODO: Resolve execution_id to decision_id if needed ---
       # This might involve querying the KG Tool API or checking workflow output mapping.
       decision_id = decision_or_execution_id
       # --- End TODO ---
       if not await self._check_permissions(user_id, "view_explanation", decision_id):
           return self._create_miz_oki_response(None, "permission_denied", errors=[{"message": "Permission denied"}])
       if not self.xai_tool:
           return self._create_miz_oki_response(None, "service_unavailable", errors=[{"message": "Explainability system (XAI Tool proxy) unavailable."}])

       status = "pending"; response_payload = None; errors = []
       try:
           # Prepare MIZ OKI request for XAI Tool API proxy
           xai_request = {
               "payload": {"decision_id": decision_id, "method": method, "role": role},
               "request_id": f"xai_explain_{decision_id}"
           }
           xai_response = None
           # Call appropriate XAI Tool API proxy method
           if role and hasattr(self.xai_tool, 'provide_role_based_explanation'):
               xai_response = await self.xai_tool.provide_role_based_explanation(input_data=xai_request)
           elif hasattr(self.xai_tool, 'explain_decision'):
               xai_response = await self.xai_tool.explain_decision(input_data=xai_request)
           else:
               errors.append({"code": "XAI_METHOD_UNAVAILABLE", "message": "Required XAI explanation method unavailable via proxy."})
               status = "config_error"

           if xai_response: # If a method was called
               if xai_response.get("status") == "success":
                   status = "success"; response_payload = {"explanation": xai_response.get("payload", {}).get("explanation")}
               else:
                   status = xai_response.get("status", "failed")
                   errors = xai_response.get("error_details", [{"code": "XAI_ERROR", "message": "Explanation failed/not found."}])
       except Exception as e:
            status = "internal_error"; errors.append({"code": "INTERNAL_ERROR", "message": f"Internal error getting explanation: {e}"})
            self.logger.error(f"Error calling XAI Tool API proxy: {e}", exc_info=True)

       response = self._create_miz_oki_response(None, status, response_payload, errors if errors else None)
       response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
       return response

   # --- Task Assignment (Uses REAL Vertex Workflow Client Proxy) ---
   async def assign_task_to_workflow(self, user_id: str, workflow_id: str, task_data: Dict, initial_context: Optional[Dict] = None) -> Dict[str, Any]:
       """ API Method: Triggers a specific Vertex AI Workflow asynchronously using the REAL client proxy. """
       self.logger.info(f"API: assign_task_to_workflow async '{workflow_id}' by '{user_id}'")
       start_time = time.monotonic()
       if not await self._check_permissions(user_id, "trigger_workflow", workflow_id):
           return self._create_miz_oki_response(None, "permission_denied", errors=[{"message": "Permission denied"}])
       if not self.exec_client or not VERTEX_WORKFLOWS_SDK_AVAILABLE:
           return self._create_miz_oki_response(None, "service_unavailable", errors=[{"message": "Workflow execution system (Vertex client proxy/SDK) unavailable."}])
       if not isinstance(task_data, dict):
            return self._create_miz_oki_response(None, "bad_request", errors=[{"message": "task_data must be a dictionary."}])

       status = "pending"; response_payload = None; errors = []
       try:
           parent = await self._get_parent_path()
           workflow_name = f"{parent}/workflows/{workflow_id}"

           # Prepare MIZ OKI payload for the workflow argument
           workflow_input_payload = {**(initial_context or {}), **task_data}
           miz_oki_input = {
               "miz_oki_version": self.config.miz_oki_schema_version,
               "request_id": f"req_wf_{uuid.uuid4().hex[:8]}",
               "trace_id": f"trace_api_{uuid.uuid4().hex[:8]}", # Generate trace ID if not provided
               "source_component": f"HumanAPI:{user_id}",
               "target_component": workflow_id,
               "payload": workflow_input_payload
           }
           execution_args = json.dumps(miz_oki_input)
           execution_proto = Execution(argument=execution_args) # Use real proto type
           request = CreateExecutionRequest(parent=workflow_name, execution=execution_proto) # Use real request type

           # Call the REAL client proxy method
           exec_response = await self.exec_client.create_execution(request=request)
           execution_name = exec_response.name # Full execution name

           status = "submitted"; response_payload = {"execution_name": execution_name}
           self.logger.info(f"Workflow '{workflow_id}' execution '{execution_name}' created by {user_id} via client proxy.")

       except google_api_exceptions.NotFound as nf_e:
            status = "not_found"; errors.append({"code": "WORKFLOW_NOT_FOUND", "message": f"Workflow '{workflow_id}' not found."}); logger.error(f"Workflow '{workflow_id}' not found: {nf_e}")
       except google_api_exceptions.GoogleAPIError as api_e:
            status = "api_error"; errors.append({"code": "VERTEX_API_ERROR", "message": str(api_e)}); logger.error(f"API Error triggering workflow '{workflow_id}' via client proxy: {api_e}")
       except Exception as e:
            status = "internal_error"; errors.append({"code": "INTERNAL_ERROR", "message": f"Internal error: {e}"}); logger.error(f"Error triggering workflow async '{workflow_id}' via client proxy: {e}", exc_info=True)

       response = self._create_miz_oki_response(None, status, response_payload, errors if errors else None)
       response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
       return response

   async def get_workflow_execution_details(self, user_id: str, execution_name: str) -> Dict[str, Any]:
       """ API Method: Retrieves workflow execution details async using the REAL client proxy. """
       self.logger.info(f"API: get_workflow_execution_details async for '{execution_name}' by '{user_id}'")
       start_time = time.monotonic()
       # Basic validation of execution_name format
       if not execution_name or not execution_name.startswith("projects/"):
            return self._create_miz_oki_response(None, "bad_request", errors=[{"message": "Invalid execution_name format."}])
       if not await self._check_permissions(user_id, "view_task", execution_name):
           return self._create_miz_oki_response(None, "permission_denied", errors=[{"message": "Permission denied"}])
       if not self.exec_client or not VERTEX_WORKFLOWS_SDK_AVAILABLE:
           return self._create_miz_oki_response(None, "service_unavailable", errors=[{"message": "Workflow status system (Vertex client proxy/SDK) unavailable."}])

       status = "pending"; response_payload = None; errors = []
       try:
           request = GetExecutionRequest(name=execution_name) # Use real request type
           execution_details_proto = await self.exec_client.get_execution(request=request) # Call proxy

           # Convert proto to dict for JSON response (handle potential errors)
           try:
               execution_details_dict = json_format.MessageToDict(execution_details_proto._pb, preserving_proto_field_name=True)
               # Convert state enum number to name string if possible
               if 'state' in execution_details_dict:
                    try: execution_details_dict['state'] = ExecutionState(execution_details_proto.state).name
                    except ValueError: execution_details_dict['state'] = f"UNKNOWN_STATE_{execution_details_proto.state}"
           except Exception as format_e:
                logger.error(f"Failed to format execution details proto to dict: {format_e}")
                # Fallback: return basic info
                execution_details_dict = {"name": execution_details_proto.name, "state": ExecutionState(execution_details_proto.state).name, "error": "Details formatting error"}

           status = "success"; response_payload = {"execution": execution_details_dict}

       except google_api_exceptions.NotFound:
            status = "not_found"; errors.append({"code": "EXECUTION_NOT_FOUND", "message": "Workflow execution not found."}); logger.warning(f"Workflow execution '{execution_name}' not found.")
       except google_api_exceptions.GoogleAPIError as api_e:
            status = "api_error"; errors.append({"code": "VERTEX_API_ERROR", "message": str(api_e)}); logger.error(f"API Error getting execution '{execution_name}' via client proxy: {api_e}")
       except Exception as e:
            status = "internal_error"; errors.append({"code": "INTERNAL_ERROR", "message": f"Internal error: {e}"}); logger.error(f"Error getting execution details async for '{execution_name}' via client proxy: {e}", exc_info=True)

       response = self._create_miz_oki_response(None, status, response_payload, errors if errors else None)
       response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
       return response

# --- Initialization (Conceptual - within API framework like FastAPI) ---
# human_agent_interface_api: Optional[HumanAgentInterfaceAPI] = None

# async def initialize_human_api():
#     global human_agent_interface_api
#     if not _config_obj or not _real_backend_tools: # Check if config and tool proxies are ready
#         logger.critical("Cannot initialize Human API: Config or backend tool proxies missing.")
#         return
#     try:
#         human_agent_interface_api = HumanAgentInterfaceAPI(
#             decision_engine_proxy=_hde_tool_proxy,
#             goal_generator_proxy=_agg_tool_proxy,
#             validator_proxy=_validator_tool_proxy,
#             xai_proxy=_xai_tool_proxy,
#             executions_client_proxy=_exec_client_proxy, # Pass REAL client proxy instance
#             pubsub_client_proxy=_pubsub_client_proxy, # Pass REAL client proxy instance
#             config=_config_obj
#         )
#         logger.info("HumanAgentInterfaceAPI initialized.")
#     except Exception as e:
#         logger.critical(f"HumanAgentInterfaceAPI initialization failed: {e}", exc_info=True)
#         human_agent_interface_api = None

# --- Example FastAPI Endpoint Definition (Conceptual) ---
# if _real_backend_tools and human_agent_interface_api: # Only define if dependencies are real
#     @app.get("/reviews/pending", response_model=MizOkiResponse)
#     async def get_reviews(user_id: str = Query(...), limit: int = Query(20)):
#         # Assumes user_id is obtained from auth middleware
#         result = await human_agent_interface_api.get_pending_reviews(user_id, limit)
#         # FastAPI handles converting dict to JSON response
#         return result
#
#     @app.post("/reviews/{review_id}/approve", response_model=MizOkiResponse)
#     async def approve(user_id: str = Query(...), review_id: str = Path(...), payload: Optional[Dict] = Body(None)):
#         result = await human_agent_interface_api.approve_action(user_id, review_id, payload.get("approval_data"), payload.get("comments"))
#         return result
#     # ... other endpoints ...

print("\n--- MIZ 3.0 Human-Agent Interface API Logic (Cell 16 - Reworked) ---")
print("Uses REAL Vertex AI Executions client proxy (if available) for workflow interactions.")
print("Calls backend tools via injected proxies (conceptual MIZ OKI APIs).")
print("Requires deployment within an API framework (e.g., FastAPI).")
print("--------------------------------------------------------------------")

SyntaxError: invalid syntax (<ipython-input-18-5ce5a48db9ae>, line 34)

In [19]:
# Cell 17: MLOps & Training Pipelines (Vertex AI Integration) (Reworked)
# Status: Uses google-auth for secure MoE registry API call. KFP v1 structure maintained.
#         Component logic remains placeholder. LLM pipeline needs separate definition.
#         Added conceptual orchestrator class for triggering/monitoring.

import kfp
from kfp import dsl
from kfp import compiler as v1_compiler
# Check if components are available, otherwise define dummies for compilation
try:
    from google_cloud_pipeline_components.v1.endpoint import EndpointCreateOp, ModelDeployOp
    from google_cloud_pipeline_components.v1.model import ModelUploadOp
    GCPC_V1_AVAILABLE = True
except ImportError:
    GCPC_V1_AVAILABLE = False
    logging.warning("google-cloud-pipeline-components v1 not found. Using dummy Ops for pipeline definition.")
    # Define dummy Ops for compilation if SDK is missing
    @dsl.component
    def EndpointCreateOp(project: str, location: str, display_name: str, labels: dict) -> dsl.OutputPath(dsl.Artifact): return dsl.OutputPath(dsl.Artifact)
    @dsl.component
    def ModelUploadOp(project: str, location: str, display_name: str, artifact_uri: str, serving_container_image_uri: str, labels: dict) -> dsl.OutputPath(dsl.Artifact): return dsl.OutputPath(dsl.Artifact)
    @dsl.component
    def ModelDeployOp(project: str, endpoint: dsl.Input[dsl.Artifact], model: dsl.Input[dsl.Artifact], deployed_model_display_name: str, machine_type: str, traffic_split: dict) -> dsl.OutputPath(dsl.Artifact): return dsl.OutputPath(dsl.Artifact)


import datetime
import os
import json
import logging
import random # Added for component simulation

# --- Ensure Logger and Config Vars are Available ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('MIZ-OKI.MLOps')

# Load config using CONFIG_OBJ from Cell 1 (reworked)
try:
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ: raise ValueError("CONFIG_OBJ not found or is None.")
    if not isinstance(CONFIG_OBJ, EnhancedConfig): raise NameError("CONFIG_OBJ is not an EnhancedConfig instance")

    PROJECT_ID = CONFIG_OBJ.gcp.project_id
    REGION = CONFIG_OBJ.gcp.region
    BUCKET_NAME = CONFIG_OBJ.gcp.gcs_bucket_name
    PIPELINE_ROOT = CONFIG_OBJ.mlops_pipeline_root
    # Use service_endpoints config for MoE registry URL
    MOE_REGISTRY_ENDPOINT = CONFIG_OBJ.service_endpoints.moe_registry_api_endpoint
    MLOPS_TRIGGER_TOPIC = CONFIG_OBJ.mlops_trigger_topic
    MLOPS_SERVING_IMAGE = CONFIG_OBJ.mlops_serving_image

    if not all([PROJECT_ID, REGION, BUCKET_NAME, PIPELINE_ROOT, MLOPS_SERVING_IMAGE]):
        raise ValueError("Essential GCP config (Project, Region, Bucket, Pipeline Root, Serving Image) missing for MLOps.")
    if not MOE_REGISTRY_ENDPOINT:
         logger.warning("MOE_REGISTRY_API_ENDPOINT not configured. Pipeline registration step will fail if included.")

    TIMESTAMP = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M%S")
    logger.info(f"MLOps Config: Project={PROJECT_ID}, Region={REGION}, PipelineRoot={PIPELINE_ROOT}")
    MLOPS_CONFIG_LOADED = True
except Exception as config_err:
   logger.critical(f"MLOps Configuration Error: {config_err}. Cannot define pipeline.", exc_info=True)
   # Define dummies to allow script execution but pipeline will be invalid
   PROJECT_ID, REGION, BUCKET_NAME, PIPELINE_ROOT, MOE_REGISTRY_ENDPOINT, MLOPS_TRIGGER_TOPIC, MLOPS_SERVING_IMAGE = "dummy", "dummy", "dummy", "gs://dummy/pipelines", None, "dummy", "dummy-image"
   TIMESTAMP = "dummy-ts"
   MLOPS_CONFIG_LOADED = False

# --- Pipeline Components (Placeholders - Need Real Implementation) ---

# @kfp.dsl.component(...) # Keep decorator
# def prepare_data_op(...): ... # Keep signature and placeholder logic from previous rework

# @kfp.dsl.component(...) # Keep decorator
# def train_expert_model_op(...): ... # Keep signature and placeholder logic from previous rework

# @kfp.dsl.component(...) # Keep decorator
# def evaluate_model_op(...): ... # Keep signature and placeholder logic from previous rework

# --- [Refined update_moe_manager_op definition - Uses google-auth] ---
@kfp.dsl.component(
    base_image="python:3.10", # Use a standard Python image
    packages_to_install=["google-cloud-aiplatform", "requests", "google-auth"] # Add necessary packages
)
def update_moe_manager_op(
    expert_id: str, # Unique ID for the expert (e.g., model display name)
    model_resource_name: str, # Full Vertex AI Model resource name (projects/.../models/...)
    endpoint_resource_name: str, # Full Vertex AI Endpoint resource name (projects/.../endpoints/...)
    task_type: str, # e.g., 'classification', 'forecasting', 'recommendation'
    domain: str, # e.g., 'roas', 'churn', 'product_similarity'
    metrics_json: dsl.Input[dsl.Artifact], # Input artifact containing evaluation metrics JSON file
    moe_registry_api_endpoint: str, # URL of the deployed MoE Registry API service
    project: str, # GCP Project ID (for logging/context)
    location: str # GCP Region (for logging/context)
):
   """
   Calls the MoE Registry API (deployed as a separate service, e.g., Cloud Run)
   to register or update an expert model using google-auth for secure invocation.
   """
   import logging
   import json
   import os
   import requests
   import google.auth.transport.requests
   import google.oauth2.id_token

   logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(asctime)s: %(message)s')
   logger = logging.getLogger('UpdateMoEManagerOp')

   if not moe_registry_api_endpoint:
       logger.critical("MoE Registry API endpoint missing! Cannot register expert.")
       raise ValueError("moe_registry_api_endpoint parameter is required.")

   logger.info(f"Updating MoE Registry for Expert: {expert_id} via API: {moe_registry_api_endpoint}")

   # Load evaluation metrics from the input artifact
   eval_metrics = {}
   try:
       with open(metrics_json.path, 'r') as f:
           eval_metrics = json.load(f)
       logger.info(f"Loaded metrics: {eval_metrics}")
   except Exception as metrics_e:
       logger.warning(f"Could not load metrics from {metrics_json.path}: {metrics_e}")

   # Construct the payload for the MoE Registry API
   registry_payload = {
       "expert_id": expert_id,
       "vertex_model_name": model_resource_name,
       "vertex_endpoint_name": endpoint_resource_name,
       "task_type": task_type,
       "domain": domain,
       "status": "active", # Mark as active upon successful deployment
       "evaluation_metrics": eval_metrics,
       "pipeline_job_name": os.environ.get('KFP_RUN_ID', 'unknown_kfp_run_id'), # Get KFP run ID if available
       "last_updated": datetime.datetime.now(datetime.timezone.utc).isoformat()
   }

   auth_token = None
   try:
       # Obtain an OIDC ID token to authenticate the call to the MoE Registry API
       # Assumes the MoE Registry API is deployed on Cloud Run/Functions and allows authenticated invocations
       # The pipeline's service account needs the 'roles/run.invoker' role on the MoE Registry service.
       auth_req = google.auth.transport.requests.Request()
       id_token = google.oauth2.id_token.fetch_id_token(auth_req, moe_registry_api_endpoint)
       auth_token = f"Bearer {id_token}"
       logger.info(f"Fetched Google ID token for audience: {moe_registry_api_endpoint}")
   except Exception as auth_e:
       logger.error(f"Failed to get Google ID token: {auth_e}. Check pipeline SA permissions.", exc_info=True)
       # Decide whether to fail the pipeline or proceed without registration
       raise RuntimeError(f"Auth failed for MoE API: {auth_e}") from auth_e

   headers = {"Authorization": auth_token, "Content-Type": "application/json"}
   # Assuming the MoE Registry API uses PUT for create/update on /experts/{expert_id}
   expert_api_url = f"{moe_registry_api_endpoint.rstrip('/')}/experts/{expert_id}"

   logger.info(f"Calling MoE Registry API via PUT: {expert_api_url}")
   try:
       # Make the authenticated API call
       response = requests.put(expert_api_url, headers=headers, json=registry_payload, timeout=90) # 90s timeout
       logger.info(f"MoE Registry API Response Status: {response.status_code}")
       response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
       logger.info(f"Successfully registered/updated expert '{expert_id}' in MoE registry.")
   except requests.exceptions.RequestException as api_e:
       logger.error(f"MoE Registry API call failed: {api_e}")
       # Log response body if available for debugging
       try: logger.error(f"API Response Body: {api_e.response.text}")
       except: pass
       raise RuntimeError(f"Failed to update MoE registry via API: {api_e}") from api_e
   except Exception as e:
       logger.error(f"Update MoE Manager op failed unexpectedly: {e}", exc_info=True)
       raise

# --- Define the Training Pipeline (Structure remains same, uses refined component) ---
@kfp.dsl.pipeline(
    name="miz3-expert-training-pipeline-v1deploy-apireg-reworked",
    description="Trains, evaluates, deploys MIZ 3.0 expert models, registers via API.",
    pipeline_root=PIPELINE_ROOT
)
def expert_training_pipeline_v1deploy_apireg(
   project: str = PROJECT_ID,
   location: str = REGION,
   source_uri_or_query: str, # GCS path (e.g., gs://...) or BQ table (project.dataset.table)
   target_column: str, # Name of the target variable column
   model_display_name_prefix: str, # Prefix for the deployed Vertex AI Model name
   task_type: str, # e.g., 'classification', 'forecasting', 'recommendation'
   expert_domain: str, # e.g., 'roas', 'churn', 'product_similarity'
   # Ensure MOE_REGISTRY_ENDPOINT is passed or available globally/via config
   moe_registry_api_endpoint: str = MOE_REGISTRY_ENDPOINT,
   data_source_type: str = 'gcs', # 'gcs' or 'bq'
   output_shape_json: str = '[1]', # JSON string representing output shape, e.g., '[1]' for regression, '[num_classes]' for classification
   hyperparameters_json: str = '{}', # JSON string of hyperparameters for training component
   epochs: int = 10, # Example hyperparameter
   batch_size: int = 32, # Example hyperparameter
   serving_image: str = MLOPS_SERVING_IMAGE, # Serving container image URI
   deployment_threshold_metric: str = "accuracy", # Metric used for deployment condition
   deployment_threshold_value: float = 0.75, # Threshold value for deployment
   endpoint_display_name_prefix: str = "miz3-shared-expert-endpoint", # Prefix for Vertex AI Endpoint
   deploy_machine_type: str = "n1-standard-4", # Machine type for deployment
   deploy_traffic_split_json: str = '{"0": 100}', # Deploy with 100% traffic initially
):
    # --- Check if GCPC SDK is available before using its Ops ---
    if not GCPC_V1_AVAILABLE:
        raise RuntimeError("google-cloud-pipeline-components v1 SDK not found. Cannot define pipeline using GCPC Ops.")

    # Generate unique names for model and endpoint using pipeline job ID
    run_timestamp = dsl.PIPELINE_JOB_ID_PLACEHOLDER # KFP v1 placeholder
    model_display_name = f"{model_display_name_prefix}-{expert_domain}-{run_timestamp}"
    endpoint_display_name = f"{endpoint_display_name_prefix}-{expert_domain}" # Shared endpoint per domain

    # 1. Prepare Data
    prepare_data_task = prepare_data_op(
        project_id=project,
        bucket_name=BUCKET_NAME, # Assuming BUCKET_NAME is globally available from config
        data_source_type=data_source_type,
        source_uri_or_query=source_uri_or_query,
        target_column=target_column
    ).set_display_name("Prepare Data")

    # 2. Train Model
    train_model_task = train_expert_model_op(
        train_data=prepare_data_task.outputs["output_train_uri"],
        input_scaler_uri=prepare_data_task.outputs["output_scaler_uri"],
        target_column=target_column,
        model_id_prefix=model_display_name_prefix, # Pass prefix
        model_version=run_timestamp, # Use run ID as version
        task_type=task_type,
        output_shape_json=output_shape_json,
        hyperparameters_json=hyperparameters_json,
        epochs=epochs,
        batch_size=batch_size
    ).set_display_name(f"Train Expert ({task_type}/{expert_domain})")

    # 3. Evaluate Model
    evaluate_model_task = evaluate_model_op(
        test_data=prepare_data_task.outputs["output_test_uri"],
        model=train_model_task.outputs["model_dir"],
        input_scaler_uri=prepare_data_task.outputs["output_scaler_uri"],
        target_column=target_column
    ).set_display_name("Evaluate Model")

    # 4. Conditional Deployment & Registration
    with dsl.Condition(
        evaluate_model_task.outputs["kfp_metrics"].outputs[deployment_threshold_metric] >= deployment_threshold_value,
        name="deploy-condition"
    ):
        # 4a. Upload Model to Vertex AI Model Registry
        model_upload_op = ModelUploadOp(
            project=project,
            location=location,
            display_name=model_display_name,
            artifact_uri=train_model_task.outputs["model_dir"].uri, # Use .uri for artifact path
            serving_container_image_uri=serving_image,
            labels={"miz_pipeline_run_id": run_timestamp, "miz_expert_domain": expert_domain, "miz_task_type": task_type}
        ).set_display_name("Upload Model")

        # 4b. Create or Get Endpoint (shared endpoint per domain)
        endpoint_create_op = EndpointCreateOp(
            project=project,
            location=location,
            display_name=endpoint_display_name, # Use consistent name for shared endpoint
            labels={"miz_app": "bgi_platform", "miz_domain": expert_domain}
        ).set_display_name(f"Create/Get Endpoint ({expert_domain})")

        # 4c. Deploy Model to Endpoint
        model_deploy_op = ModelDeployOp(
            project=project,
            endpoint=endpoint_create_op.outputs["endpoint"],
            model=model_upload_op.outputs["model"],
            deployed_model_display_name=model_display_name, # Unique name for this deployment
            machine_type=deploy_machine_type,
            traffic_split=json.loads(deploy_traffic_split_json) # Deploy with specified traffic
        ).set_display_name(f"Deploy Model ({deploy_traffic_split_json} Traffic)")

        # 4d. Update MoE Registry via API Call (using the refined component)
        # Ensure the endpoint parameter is passed correctly
        update_moe_task = update_moe_manager_op(
            project=project,
            location=location,
            expert_id=model_display_name, # Use the unique model display name as expert ID
            model_resource_name=model_upload_op.outputs["model"].resource_name, # Pass full resource name
            endpoint_resource_name=endpoint_create_op.outputs["endpoint"].resource_name, # Pass full resource name
            task_type=task_type,
            domain=expert_domain,
            metrics_json=evaluate_model_task.outputs["metrics_output_path"],
            moe_registry_api_endpoint=moe_registry_api_endpoint # Pass the API endpoint URL
        ).after(model_deploy_op).set_display_name("Update MoE Registry API") # Run after deployment

# --- Compile Pipeline ---
pipeline_filename = None
if MLOPS_CONFIG_LOADED:
    pipeline_filename = f"miz3_expert_training_pipeline_{TIMESTAMP}.json"
    try:
       if MOE_REGISTRY_ENDPOINT is None:
           logger.warning("MOE_REGISTRY_API_ENDPOINT is not set. The 'update_moe_manager_op' step will fail if included in execution.")
           # Optionally remove the MoE update step if endpoint is missing, or let it fail during execution
           # For compilation, we might need to pass a dummy value if the parameter is mandatory
           # However, the component logic handles the check, so compilation might proceed.

       # Use KFP v1 compiler explicitly
       v1_compiler.Compiler(mode=kfp.dsl.PipelineExecutionMode.V1_LEGACY).compile(
           pipeline_func=expert_training_pipeline_v1deploy_apireg,
           package_path=pipeline_filename
       )
       logger.info(f"Pipeline compiled successfully to {pipeline_filename}")
    except Exception as compile_e:
       logger.critical("Pipeline compilation failed!", exc_info=True)
       pipeline_filename = None
else:
    logger.critical("MLOps configuration failed to load. Skipping pipeline compilation.")


# --- Conceptual Class for Triggering/Monitoring Pipelines via MIZ OKI Events ---
class TrainingPipelineOrchestrator:
    """
    Conceptual service that listens to Pub/Sub triggers (MIZ OKI format)
    and launches/monitors Vertex AI Pipeline Jobs.
    """
    def __init__(self, config: EnhancedConfig, pipeline_client: Optional[Any] = None, pubsub_client: Optional[Any] = None):
        # In a real service, pipeline_client would be Vertex AI Pipelines client
        # pubsub_client would be Pub/Sub client
        self.config = config
        self.pipeline_client = pipeline_client # e.g., aiplatform.PipelineJob
        self.pubsub_client = pubsub_client # e.g., pubsub_v1.PublisherClient
        self.logger = logging.getLogger("MIZ-OKI.TrainingPipelineOrchestrator")
        if not self.pipeline_client: self.logger.warning("Vertex AI Pipeline client not provided.")
        if not self.pubsub_client: self.logger.warning("Pub/Sub client not provided.")

    async def handle_trigger_event(self, miz_oki_event: Dict) -> Dict:
        """Handles a MIZ OKI event requesting a pipeline launch."""
        trace_id = miz_oki_event.get("trace_id", f"mlops_trigger_{uuid.uuid4().hex[:8]}")
        response_status = "failed"; response_payload = None; errors = []
        try:
            if not self.pipeline_client: raise RuntimeError("Pipeline client unavailable.")
            event_payload = miz_oki_event.get("payload", {})
            pipeline_name = event_payload.get("pipeline_name")
            pipeline_params = event_payload.get("parameters", {})
            if not pipeline_name or not pipeline_params:
                raise ValueError("Missing 'pipeline_name' or 'parameters' in trigger event payload.")

            # --- TODO: Map pipeline_name to the compiled template path (e.g., from GCS) ---
            template_path = f"{PIPELINE_ROOT}/{pipeline_name}.json" # Example mapping
            if not template_path: raise ValueError(f"Cannot find template for pipeline: {pipeline_name}")
            # --- End TODO ---

            job_id = f"{pipeline_name}-run-{datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%d%H%M%S')}"
            logger.info(f"Submitting Vertex AI Pipeline job '{job_id}' from template '{template_path}' with params: {pipeline_params}")

            # Use Vertex AI SDK to submit the job (this part is synchronous in current SDK)
            # Needs to be run in a thread for async context
            def _submit_job():
                job = aiplatform.PipelineJob(
                    display_name=job_id,
                    template_path=template_path,
                    pipeline_root=PIPELINE_ROOT,
                    parameter_values=pipeline_params,
                    project=self.config.gcp.project_id,
                    location=self.config.gcp.region,
                    # enable_caching=True, # Optional
                )
                job.submit() # Submits the job
                return job.resource_name # Return the job resource name

            job_resource_name = await asyncio.to_thread(_submit_job)
            logger.info(f"Successfully submitted Vertex AI Pipeline job: {job_resource_name}")
            response_status = "submitted"; response_payload = {"job_resource_name": job_resource_name, "job_id_prefix": job_id}

        except Exception as e:
            logger.error(f"Pipeline launch error from trigger event: {e}", exc_info=True)
            errors.append({"code": "PIPELINE_SUBMIT_ERROR", "message": str(e)})
            response_status = "error"

        # Return a MIZ OKI response (optional, depends if caller needs confirmation)
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": miz_oki_event.get("request_id"), "trace_id": trace_id,
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "TrainingPipelineOrchestrator",
            "status": response_status, "payload": response_payload, "error_details": errors
        }

    async def handle_completion_event(self, miz_oki_event: Dict) -> Dict:
        """Handles Pub/Sub event for completed training, triggers post-processing/evaluation."""
        # This would typically be triggered by the pipeline itself publishing to a topic upon completion.
        # The event payload should contain job details (resource name, status, outputs).
        trace_id = miz_oki_event.get("trace_id", f"mlops_complete_{uuid.uuid4().hex[:8]}")
        response_status = "failed"; response_payload = None; errors = []
        try:
            job_details = miz_oki_event.get("payload", {})
            job_resource_name = job_details.get("job_resource_name")
            job_status = job_details.get("status") # e.g., 'PIPELINE_STATE_SUCCEEDED', 'PIPELINE_STATE_FAILED'
            if not job_resource_name or not job_status:
                raise ValueError("Missing job details in completion event.")

            logger.info(f"Handling pipeline completion event for job: {job_resource_name}, Status: {job_status}")

            if job_status == 'PIPELINE_STATE_SUCCEEDED':
                # --- TODO: Trigger next steps ---
                # - Call MoE Registry update (if not done in pipeline)
                # - Trigger evaluation workflows
                # - Update KG with model metadata
                # Example: Publish event for LI Tool
                # if self.pubsub_client:
                #     li_event = {...}
                #     await self.pubsub_client.publish(...)
                # --- End TODO ---
                response_status = "success"; response_payload = {"job_resource_name": job_resource_name, "action": "Post-processing triggered (placeholder)"}
            else:
                # Handle failed pipeline
                logger.error(f"Pipeline job {job_resource_name} failed. Status: {job_status}. Details: {job_details.get('error')}")
                response_status = "failed"; errors.append({"code": "PIPELINE_FAILED", "message": f"Job {job_resource_name} failed.", "details": job_details.get('error')})
                # --- TODO: Trigger alerting or remediation ---

        except Exception as e:
            logger.error(f"Pipeline completion handler error: {e}", exc_info=True)
            errors.append({"code": "HANDLER_ERROR", "message": str(e)})
            response_status = "error"

        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": miz_oki_event.get("request_id"), "trace_id": trace_id,
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "TrainingPipelineOrchestrator",
            "status": response_status, "payload": response_payload, "error_details": errors
        }


# --- Notes on Execution ---
logger.info("--- MLOps Pipeline Execution Notes (Reworked) ---")
if MLOPS_CONFIG_LOADED and pipeline_filename:
    logger.info(f"1. Upload compiled pipeline: '{pipeline_filename}' to GCS bucket '{BUCKET_NAME}' (e.g., under {PIPELINE_ROOT}).")
    logger.info(f"2. Setup Cloud Function/Run service triggered by Pub/Sub topic: '{MLOPS_TRIGGER_TOPIC}'.")
    logger.info("3. The trigger service should instantiate TrainingPipelineOrchestrator and call handle_trigger_event.")
    logger.info(f"4. Ensure MoE Registry API ('{MOE_REGISTRY_ENDPOINT or 'endpoint_not_set'}') is deployed & pipeline SA has 'roles/run.invoker'.")
    logger.info("5. Implement REAL logic in pipeline components (prepare_data_op, train_expert_model_op, evaluate_model_op).")
    logger.info("6. Define SEPARATE KFP pipelines for LLM Fine-Tuning/Distillation (likely using CustomJobOp or specific GCPC components).")
else:
     logger.error("MLOps pipeline configuration incomplete or compilation failed. Deployment steps cannot be determined.")

print(f"\n--- MIZ 3.0 MLOps Pipeline Definition Compiled (Cell 17 - Reworked) ---")
if pipeline_filename: print(f"Pipeline definition saved to: {pipeline_filename}")
else: print("Pipeline compilation FAILED. Check logs.")
print("-----------------------------------------------------------------")

SyntaxError: non-default argument follows default argument (<ipython-input-19-287087a799ae>, line 178)

In [20]:
# Cell 17: MLOps & Training Pipelines (Vertex AI Integration) (Reworked)
# Status: Uses google-auth for secure MoE registry API call. KFP v1 structure maintained.
#         Component logic remains placeholder. LLM pipeline needs separate definition.
#         Added conceptual orchestrator class for triggering/monitoring.

import kfp
from kfp import dsl
from kfp import compiler as v1_compiler
# Check if components are available, otherwise define dummies for compilation
try:
    from google_cloud_pipeline_components.v1.endpoint import EndpointCreateOp, ModelDeployOp
    from google_cloud_pipeline_components.v1.model import ModelUploadOp
    GCPC_V1_AVAILABLE = True
except ImportError:
    GCPC_V1_AVAILABLE = False
    logging.warning("google-cloud-pipeline-components v1 not found. Using dummy Ops for pipeline definition.")
    # Define dummy Ops for compilation if SDK is missing
    @dsl.component
    def EndpointCreateOp(project: str, location: str, display_name: str, labels: dict) -> dsl.OutputPath(dsl.Artifact): return dsl.OutputPath(dsl.Artifact)
    @dsl.component
    def ModelUploadOp(project: str, location: str, display_name: str, artifact_uri: str, serving_container_image_uri: str, labels: dict) -> dsl.OutputPath(dsl.Artifact): return dsl.OutputPath(dsl.Artifact)
    @dsl.component
    def ModelDeployOp(project: str, endpoint: dsl.Input[dsl.Artifact], model: dsl.Input[dsl.Artifact], deployed_model_display_name: str, machine_type: str, traffic_split: dict) -> dsl.OutputPath(dsl.Artifact): return dsl.OutputPath(dsl.Artifact)


import datetime
import os
import json
import logging
import random # Added for component simulation

# --- Ensure Logger and Config Vars are Available ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('MIZ-OKI.MLOps')

# Load config using CONFIG_OBJ from Cell 1 (reworked)
try:
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ: raise ValueError("CONFIG_OBJ not found or is None.")
    if not isinstance(CONFIG_OBJ, EnhancedConfig): raise NameError("CONFIG_OBJ is not an EnhancedConfig instance")

    PROJECT_ID = CONFIG_OBJ.gcp.project_id
    REGION = CONFIG_OBJ.gcp.region
    BUCKET_NAME = CONFIG_OBJ.gcp.gcs_bucket_name
    PIPELINE_ROOT = CONFIG_OBJ.mlops_pipeline_root
    # Use service_endpoints config for MoE registry URL
    MOE_REGISTRY_ENDPOINT = CONFIG_OBJ.service_endpoints.moe_registry_api_endpoint
    MLOPS_TRIGGER_TOPIC = CONFIG_OBJ.mlops_trigger_topic
    MLOPS_SERVING_IMAGE = CONFIG_OBJ.mlops_serving_image

    if not all([PROJECT_ID, REGION, BUCKET_NAME, PIPELINE_ROOT, MLOPS_SERVING_IMAGE]):
        raise ValueError("Essential GCP config (Project, Region, Bucket, Pipeline Root, Serving Image) missing for MLOps.")
    if not MOE_REGISTRY_ENDPOINT:
         logger.warning("MOE_REGISTRY_API_ENDPOINT not configured. Pipeline registration step will fail if included.")

    TIMESTAMP = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M%S")
    logger.info(f"MLOps Config: Project={PROJECT_ID}, Region={REGION}, PipelineRoot={PIPELINE_ROOT}")
    MLOPS_CONFIG_LOADED = True
except Exception as config_err:
   logger.critical(f"MLOps Configuration Error: {config_err}. Cannot define pipeline.", exc_info=True)
   # Define dummies to allow script execution but pipeline will be invalid
   PROJECT_ID, REGION, BUCKET_NAME, PIPELINE_ROOT, MOE_REGISTRY_ENDPOINT, MLOPS_TRIGGER_TOPIC, MLOPS_SERVING_IMAGE = "dummy", "dummy", "dummy", "gs://dummy/pipelines", None, "dummy", "dummy-image"
   TIMESTAMP = "dummy-ts"
   MLOPS_CONFIG_LOADED = False

# --- Pipeline Components (Placeholders - Need Real Implementation) ---

# @kfp.dsl.component(...) # Keep decorator
# def prepare_data_op(...): ... # Keep signature and placeholder logic from previous rework

# @kfp.dsl.component(...) # Keep decorator
# def train_expert_model_op(...): ... # Keep signature and placeholder logic from previous rework

# @kfp.dsl.component(...) # Keep decorator
# def evaluate_model_op(...): ... # Keep signature and placeholder logic from previous rework

# --- [Refined update_moe_manager_op definition - Uses google-auth] ---
@kfp.dsl.component(
    base_image="python:3.10", # Use a standard Python image
    packages_to_install=["google-cloud-aiplatform", "requests", "google-auth"] # Add necessary packages
)
def update_moe_manager_op(
    expert_id: str, # Unique ID for the expert (e.g., model display name)
    model_resource_name: str, # Full Vertex AI Model resource name (projects/.../models/...)
    endpoint_resource_name: str, # Full Vertex AI Endpoint resource name (projects/.../endpoints/...)
    task_type: str, # e.g., 'classification', 'forecasting', 'recommendation'
    domain: str, # e.g., 'roas', 'churn', 'product_similarity'
    metrics_json: dsl.Input[dsl.Artifact], # Input artifact containing evaluation metrics JSON file
    moe_registry_api_endpoint: str, # URL of the deployed MoE Registry API service
    project: str, # GCP Project ID (for logging/context)
    location: str # GCP Region (for logging/context)
):
   """
   Calls the MoE Registry API (deployed as a separate service, e.g., Cloud Run)
   to register or update an expert model using google-auth for secure invocation.
   """
   import logging
   import json
   import os
   import requests
   import google.auth.transport.requests
   import google.oauth2.id_token

   logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(asctime)s: %(message)s')
   logger = logging.getLogger('UpdateMoEManagerOp')

   if not moe_registry_api_endpoint:
       logger.critical("MoE Registry API endpoint missing! Cannot register expert.")
       raise ValueError("moe_registry_api_endpoint parameter is required.")

   logger.info(f"Updating MoE Registry for Expert: {expert_id} via API: {moe_registry_api_endpoint}")

   # Load evaluation metrics from the input artifact
   eval_metrics = {}
   try:
       with open(metrics_json.path, 'r') as f:
           eval_metrics = json.load(f)
       logger.info(f"Loaded metrics: {eval_metrics}")
   except Exception as metrics_e:
       logger.warning(f"Could not load metrics from {metrics_json.path}: {metrics_e}")

   # Construct the payload for the MoE Registry API
   registry_payload = {
       "expert_id": expert_id,
       "vertex_model_name": model_resource_name,
       "vertex_endpoint_name": endpoint_resource_name,
       "task_type": task_type,
       "domain": domain,
       "status": "active", # Mark as active upon successful deployment
       "evaluation_metrics": eval_metrics,
       "pipeline_job_name": os.environ.get('KFP_RUN_ID', 'unknown_kfp_run_id'), # Get KFP run ID if available
       "last_updated": datetime.datetime.now(datetime.timezone.utc).isoformat()
   }

   auth_token = None
   try:
       # Obtain an OIDC ID token to authenticate the call to the MoE Registry API
       # Assumes the MoE Registry API is deployed on Cloud Run/Functions and allows authenticated invocations
       # The pipeline's service account needs the 'roles/run.invoker' role on the MoE Registry service.
       auth_req = google.auth.transport.requests.Request()
       id_token = google.oauth2.id_token.fetch_id_token(auth_req, moe_registry_api_endpoint)
       auth_token = f"Bearer {id_token}"
       logger.info(f"Fetched Google ID token for audience: {moe_registry_api_endpoint}")
   except Exception as auth_e:
       logger.error(f"Failed to get Google ID token: {auth_e}. Check pipeline SA permissions.", exc_info=True)
       # Decide whether to fail the pipeline or proceed without registration
       raise RuntimeError(f"Auth failed for MoE API: {auth_e}") from auth_e

   headers = {"Authorization": auth_token, "Content-Type": "application/json"}
   # Assuming the MoE Registry API uses PUT for create/update on /experts/{expert_id}
   expert_api_url = f"{moe_registry_api_endpoint.rstrip('/')}/experts/{expert_id}"

   logger.info(f"Calling MoE Registry API via PUT: {expert_api_url}")
   try:
       # Make the authenticated API call
       response = requests.put(expert_api_url, headers=headers, json=registry_payload, timeout=90) # 90s timeout
       logger.info(f"MoE Registry API Response Status: {response.status_code}")
       response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
       logger.info(f"Successfully registered/updated expert '{expert_id}' in MoE registry.")
   except requests.exceptions.RequestException as api_e:
       logger.error(f"MoE Registry API call failed: {api_e}")
       # Log response body if available for debugging
       try: logger.error(f"API Response Body: {api_e.response.text}")
       except: pass
       raise RuntimeError(f"Failed to update MoE registry via API: {api_e}") from api_e
   except Exception as e:
       logger.error(f"Update MoE Manager op failed unexpectedly: {e}", exc_info=True)
       raise

# --- Define the Training Pipeline (Structure remains same, uses refined component) ---
@kfp.dsl.pipeline(
    name="miz3-expert-training-pipeline-v1deploy-apireg-reworked",
    description="Trains, evaluates, deploys MIZ 3.0 expert models, registers via API.",
    pipeline_root=PIPELINE_ROOT
)
def expert_training_pipeline_v1deploy_apireg(
   project: str = PROJECT_ID,
   location: str = REGION,
   source_uri_or_query: str, # GCS path (e.g., gs://...) or BQ table (project.dataset.table)
   target_column: str, # Name of the target variable column
   model_display_name_prefix: str, # Prefix for the deployed Vertex AI Model name
   task_type: str, # e.g., 'classification', 'forecasting', 'recommendation'
   expert_domain: str, # e.g., 'roas', 'churn', 'product_similarity'
   # Ensure MOE_REGISTRY_ENDPOINT is passed or available globally/via config
   moe_registry_api_endpoint: str = MOE_REGISTRY_ENDPOINT,
   data_source_type: str = 'gcs', # 'gcs' or 'bq'
   output_shape_json: str = '[1]', # JSON string representing output shape, e.g., '[1]' for regression, '[num_classes]' for classification
   hyperparameters_json: str = '{}', # JSON string of hyperparameters for training component
   epochs: int = 10, # Example hyperparameter
   batch_size: int = 32, # Example hyperparameter
   serving_image: str = MLOPS_SERVING_IMAGE, # Serving container image URI
   deployment_threshold_metric: str = "accuracy", # Metric used for deployment condition
   deployment_threshold_value: float = 0.75, # Threshold value for deployment
   endpoint_display_name_prefix: str = "miz3-shared-expert-endpoint", # Prefix for Vertex AI Endpoint
   deploy_machine_type: str = "n1-standard-4", # Machine type for deployment
   deploy_traffic_split_json: str = '{"0": 100}', # Deploy with 100% traffic initially
):
    # --- Check if GCPC SDK is available before using its Ops ---
    if not GCPC_V1_AVAILABLE:
        raise RuntimeError("google-cloud-pipeline-components v1 SDK not found. Cannot define pipeline using GCPC Ops.")

    # Generate unique names for model and endpoint using pipeline job ID
    run_timestamp = dsl.PIPELINE_JOB_ID_PLACEHOLDER # KFP v1 placeholder
    model_display_name = f"{model_display_name_prefix}-{expert_domain}-{run_timestamp}"
    endpoint_display_name = f"{endpoint_display_name_prefix}-{expert_domain}" # Shared endpoint per domain

    # 1. Prepare Data
    prepare_data_task = prepare_data_op(
        project_id=project,
        bucket_name=BUCKET_NAME, # Assuming BUCKET_NAME is globally available from config
        data_source_type=data_source_type,
        source_uri_or_query=source_uri_or_query,
        target_column=target_column
    ).set_display_name("Prepare Data")

    # 2. Train Model
    train_model_task = train_expert_model_op(
        train_data=prepare_data_task.outputs["output_train_uri"],
        input_scaler_uri=prepare_data_task.outputs["output_scaler_uri"],
        target_column=target_column,
        model_id_prefix=model_display_name_prefix, # Pass prefix
        model_version=run_timestamp, # Use run ID as version
        task_type=task_type,
        output_shape_json=output_shape_json,
        hyperparameters_json=hyperparameters_json,
        epochs=epochs,
        batch_size=batch_size
    ).set_display_name(f"Train Expert ({task_type}/{expert_domain})")

    # 3. Evaluate Model
    evaluate_model_task = evaluate_model_op(
        test_data=prepare_data_task.outputs["output_test_uri"],
        model=train_model_task.outputs["model_dir"],
        input_scaler_uri=prepare_data_task.outputs["output_scaler_uri"],
        target_column=target_column
    ).set_display_name("Evaluate Model")

    # 4. Conditional Deployment & Registration
    with dsl.Condition(
        evaluate_model_task.outputs["kfp_metrics"].outputs[deployment_threshold_metric] >= deployment_threshold_value,
        name="deploy-condition"
    ):
        # 4a. Upload Model to Vertex AI Model Registry
        model_upload_op = ModelUploadOp(
            project=project,
            location=location,
            display_name=model_display_name,
            artifact_uri=train_model_task.outputs["model_dir"].uri, # Use .uri for artifact path
            serving_container_image_uri=serving_image,
            labels={"miz_pipeline_run_id": run_timestamp, "miz_expert_domain": expert_domain, "miz_task_type": task_type}
        ).set_display_name("Upload Model")

        # 4b. Create or Get Endpoint (shared endpoint per domain)
        endpoint_create_op = EndpointCreateOp(
            project=project,
            location=location,
            display_name=endpoint_display_name, # Use consistent name for shared endpoint
            labels={"miz_app": "bgi_platform", "miz_domain": expert_domain}
        ).set_display_name(f"Create/Get Endpoint ({expert_domain})")

        # 4c. Deploy Model to Endpoint
        model_deploy_op = ModelDeployOp(
            project=project,
            endpoint=endpoint_create_op.outputs["endpoint"],
            model=model_upload_op.outputs["model"],
            deployed_model_display_name=model_display_name, # Unique name for this deployment
            machine_type=deploy_machine_type,
            traffic_split=json.loads(deploy_traffic_split_json) # Deploy with specified traffic
        ).set_display_name(f"Deploy Model ({deploy_traffic_split_json} Traffic)")

        # 4d. Update MoE Registry via API Call (using the refined component)
        # Ensure the endpoint parameter is passed correctly
        update_moe_task = update_moe_manager_op(
            project=project,
            location=location,
            expert_id=model_display_name, # Use the unique model display name as expert ID
            model_resource_name=model_upload_op.outputs["model"].resource_name, # Pass full resource name
            endpoint_resource_name=endpoint_create_op.outputs["endpoint"].resource_name, # Pass full resource name
            task_type=task_type,
            domain=expert_domain,
            metrics_json=evaluate_model_task.outputs["metrics_output_path"],
            moe_registry_api_endpoint=moe_registry_api_endpoint # Pass the API endpoint URL
        ).after(model_deploy_op).set_display_name("Update MoE Registry API") # Run after deployment

# --- Compile Pipeline ---
pipeline_filename = None
if MLOPS_CONFIG_LOADED:
    pipeline_filename = f"miz3_expert_training_pipeline_{TIMESTAMP}.json"
    try:
       if MOE_REGISTRY_ENDPOINT is None:
           logger.warning("MOE_REGISTRY_API_ENDPOINT is not set. The 'update_moe_manager_op' step will fail if included in execution.")
           # Optionally remove the MoE update step if endpoint is missing, or let it fail during execution
           # For compilation, we might need to pass a dummy value if the parameter is mandatory
           # However, the component logic handles the check, so compilation might proceed.

       # Use KFP v1 compiler explicitly
       v1_compiler.Compiler(mode=kfp.dsl.PipelineExecutionMode.V1_LEGACY).compile(
           pipeline_func=expert_training_pipeline_v1deploy_apireg,
           package_path=pipeline_filename
       )
       logger.info(f"Pipeline compiled successfully to {pipeline_filename}")
    except Exception as compile_e:
       logger.critical("Pipeline compilation failed!", exc_info=True)
       pipeline_filename = None
else:
    logger.critical("MLOps configuration failed to load. Skipping pipeline compilation.")


# --- Conceptual Class for Triggering/Monitoring Pipelines via MIZ OKI Events ---
class TrainingPipelineOrchestrator:
    """
    Conceptual service that listens to Pub/Sub triggers (MIZ OKI format)
    and launches/monitors Vertex AI Pipeline Jobs.
    """
    def __init__(self, config: EnhancedConfig, pipeline_client: Optional[Any] = None, pubsub_client: Optional[Any] = None):
        # In a real service, pipeline_client would be Vertex AI Pipelines client
        # pubsub_client would be Pub/Sub client
        self.config = config
        self.pipeline_client = pipeline_client # e.g., aiplatform.PipelineJob
        self.pubsub_client = pubsub_client # e.g., pubsub_v1.PublisherClient
        self.logger = logging.getLogger("MIZ-OKI.TrainingPipelineOrchestrator")
        if not self.pipeline_client: self.logger.warning("Vertex AI Pipeline client not provided.")
        if not self.pubsub_client: self.logger.warning("Pub/Sub client not provided.")

    async def handle_trigger_event(self, miz_oki_event: Dict) -> Dict:
        """Handles a MIZ OKI event requesting a pipeline launch."""
        trace_id = miz_oki_event.get("trace_id", f"mlops_trigger_{uuid.uuid4().hex[:8]}")
        response_status = "failed"; response_payload = None; errors = []
        try:
            if not self.pipeline_client: raise RuntimeError("Pipeline client unavailable.")
            event_payload = miz_oki_event.get("payload", {})
            pipeline_name = event_payload.get("pipeline_name")
            pipeline_params = event_payload.get("parameters", {})
            if not pipeline_name or not pipeline_params:
                raise ValueError("Missing 'pipeline_name' or 'parameters' in trigger event payload.")

            # --- TODO: Map pipeline_name to the compiled template path (e.g., from GCS) ---
            template_path = f"{PIPELINE_ROOT}/{pipeline_name}.json" # Example mapping
            if not template_path: raise ValueError(f"Cannot find template for pipeline: {pipeline_name}")
            # --- End TODO ---

            job_id = f"{pipeline_name}-run-{datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%d%H%M%S')}"
            logger.info(f"Submitting Vertex AI Pipeline job '{job_id}' from template '{template_path}' with params: {pipeline_params}")

            # Use Vertex AI SDK to submit the job (this part is synchronous in current SDK)
            # Needs to be run in a thread for async context
            def _submit_job():
                job = aiplatform.PipelineJob(
                    display_name=job_id,
                    template_path=template_path,
                    pipeline_root=PIPELINE_ROOT,
                    parameter_values=pipeline_params,
                    project=self.config.gcp.project_id,
                    location=self.config.gcp.region,
                    # enable_caching=True, # Optional
                )
                job.submit() # Submits the job
                return job.resource_name # Return the job resource name

            job_resource_name = await asyncio.to_thread(_submit_job)
            logger.info(f"Successfully submitted Vertex AI Pipeline job: {job_resource_name}")
            response_status = "submitted"; response_payload = {"job_resource_name": job_resource_name, "job_id_prefix": job_id}

        except Exception as e:
            logger.error(f"Pipeline launch error from trigger event: {e}", exc_info=True)
            errors.append({"code": "PIPELINE_SUBMIT_ERROR", "message": str(e)})
            response_status = "error"

        # Return a MIZ OKI response (optional, depends if caller needs confirmation)
        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": miz_oki_event.get("request_id"), "trace_id": trace_id,
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "TrainingPipelineOrchestrator",
            "status": response_status, "payload": response_payload, "error_details": errors
        }

    async def handle_completion_event(self, miz_oki_event: Dict) -> Dict:
        """Handles Pub/Sub event for completed training, triggers post-processing/evaluation."""
        # This would typically be triggered by the pipeline itself publishing to a topic upon completion.
        # The event payload should contain job details (resource name, status, outputs).
        trace_id = miz_oki_event.get("trace_id", f"mlops_complete_{uuid.uuid4().hex[:8]}")
        response_status = "failed"; response_payload = None; errors = []
        try:
            job_details = miz_oki_event.get("payload", {})
            job_resource_name = job_details.get("job_resource_name")
            job_status = job_details.get("status") # e.g., 'PIPELINE_STATE_SUCCEEDED', 'PIPELINE_STATE_FAILED'
            if not job_resource_name or not job_status:
                raise ValueError("Missing job details in completion event.")

            logger.info(f"Handling pipeline completion event for job: {job_resource_name}, Status: {job_status}")

            if job_status == 'PIPELINE_STATE_SUCCEEDED':
                # --- TODO: Trigger next steps ---
                # - Call MoE Registry update (if not done in pipeline)
                # - Trigger evaluation workflows
                # - Update KG with model metadata
                # Example: Publish event for LI Tool
                # if self.pubsub_client:
                #     li_event = {...}
                #     await self.pubsub_client.publish(...)
                # --- End TODO ---
                response_status = "success"; response_payload = {"job_resource_name": job_resource_name, "action": "Post-processing triggered (placeholder)"}
            else:
                # Handle failed pipeline
                logger.error(f"Pipeline job {job_resource_name} failed. Status: {job_status}. Details: {job_details.get('error')}")
                response_status = "failed"; errors.append({"code": "PIPELINE_FAILED", "message": f"Job {job_resource_name} failed.", "details": job_details.get('error')})
                # --- TODO: Trigger alerting or remediation ---

        except Exception as e:
            logger.error(f"Pipeline completion handler error: {e}", exc_info=True)
            errors.append({"code": "HANDLER_ERROR", "message": str(e)})
            response_status = "error"

        return {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": miz_oki_event.get("request_id"), "trace_id": trace_id,
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "TrainingPipelineOrchestrator",
            "status": response_status, "payload": response_payload, "error_details": errors
        }


# --- Notes on Execution ---
logger.info("--- MLOps Pipeline Execution Notes (Reworked) ---")
if MLOPS_CONFIG_LOADED and pipeline_filename:
    logger.info(f"1. Upload compiled pipeline: '{pipeline_filename}' to GCS bucket '{BUCKET_NAME}' (e.g., under {PIPELINE_ROOT}).")
    logger.info(f"2. Setup Cloud Function/Run service triggered by Pub/Sub topic: '{MLOPS_TRIGGER_TOPIC}'.")
    logger.info("3. The trigger service should instantiate TrainingPipelineOrchestrator and call handle_trigger_event.")
    logger.info(f"4. Ensure MoE Registry API ('{MOE_REGISTRY_ENDPOINT or 'endpoint_not_set'}') is deployed & pipeline SA has 'roles/run.invoker'.")
    logger.info("5. Implement REAL logic in pipeline components (prepare_data_op, train_expert_model_op, evaluate_model_op).")
    logger.info("6. Define SEPARATE KFP pipelines for LLM Fine-Tuning/Distillation (likely using CustomJobOp or specific GCPC components).")
else:
     logger.error("MLOps pipeline configuration incomplete or compilation failed. Deployment steps cannot be determined.")

print(f"\n--- MIZ 3.0 MLOps Pipeline Definition Compiled (Cell 17 - Reworked) ---")
if pipeline_filename: print(f"Pipeline definition saved to: {pipeline_filename}")
else: print("Pipeline compilation FAILED. Check logs.")
print("-----------------------------------------------------------------")

SyntaxError: non-default argument follows default argument (<ipython-input-20-287087a799ae>, line 178)

In [21]:
# Cell 18: Foundation Model Client (New Implementation)
# Status: Implements the unified client logic as a deployable Tool/Service.
#         Handles multiple providers (Vertex, OpenAI, Anthropic), async calls,
#         rate limiting, retries, caching, cost estimation, MIZ OKI I/O.

import logging
import asyncio
import json
import time
import uuid
import os
from typing import Dict, Any, Optional, List, Union, Tuple
from collections import defaultdict
from dataclasses import dataclass, field, asdict
import aiohttp # For async HTTP calls (e.g., Anthropic REST)
from cachetools import TTLCache # For response caching
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, RetryError # For robust API calls

# --- Assume Real Config is Available ---
try:
    # Use CONFIG_OBJ loaded in Cell 1
    if 'CONFIG_OBJ' not in globals() or not CONFIG_OBJ: raise NameError("CONFIG_OBJ not found or is None")
    if not isinstance(CONFIG_OBJ, EnhancedConfig): raise NameError("CONFIG_OBJ is not an EnhancedConfig instance")
    _config_obj = CONFIG_OBJ
    _real_dependencies = True
    logger.debug("Using real CONFIG_OBJ in Cell 18 (New Implementation).")

    # --- Vertex AI SDK ---
    try:
        import vertexai
        from vertexai.generative_models import GenerativeModel, Part, FinishReason, Candidate, GenerationResponse
        from vertexai.language_models import TextEmbeddingModel, TextEmbeddingInput # Use specific embedding model class
        # Ensure initialized (from Cell 1)
        if not _config_obj.gcp.project_id or not _config_obj.gcp.region: raise Exception("GCP Project/Region missing")
        # Check if SDK is initialized (simple check)
        if not getattr(vertexai.preview.initializer.global_config, 'project', None):
             vertexai.init(project=_config_obj.gcp.project_id, location=_config_obj.gcp.region)
             logger.info("Vertex AI SDK initialized in FM Client.")
        VERTEX_SDK_AVAILABLE = True
    except ImportError:
        logger.warning("Vertex AI SDK not found. Vertex models unavailable.")
        VERTEX_SDK_AVAILABLE = False
        # Define dummy classes for type hinting if SDK missing
        class GenerativeModel: pass; class TextEmbeddingModel: pass; class Part: pass; class FinishReason: pass; class Candidate: pass; class GenerationResponse: pass; class TextEmbeddingInput: pass

    # --- OpenAI SDK ---
    try:
        from openai import AsyncOpenAI, RateLimitError as OpenAIRateLimitError, APIError as OpenAIAPIError
        OPENAI_SDK_AVAILABLE = True
    except ImportError:
        logger.warning("OpenAI SDK not found.")
        OPENAI_SDK_AVAILABLE = False
        class AsyncOpenAI: pass; class OpenAIRateLimitError(Exception): pass; class OpenAIAPIError(Exception): pass

    # --- Anthropic SDK ---
    # Using REST via aiohttp for simplicity, but check if SDK is installed for completeness
    try:
        import anthropic
        # Check for specific exceptions if using SDK directly
        # from anthropic import RateLimitError as AnthropicRateLimitError, APIError as AnthropicAPIError
        ANTHROPIC_SDK_AVAILABLE = True
    except ImportError:
        logger.warning("Anthropic SDK not found. Using REST API via aiohttp.")
        ANTHROPIC_SDK_AVAILABLE = False
        class anthropic: pass # Dummy class

except NameError as e:
    logger.warning(f"Dependency Error in Cell 18 ({e}). Using Mocks/Placeholders.")
    _real_dependencies = False
    # --- Mock Config ---
    from dataclasses import dataclass, field
    @dataclass class MockFmDefaults: vertex: str = "gemini-1.5-flash-001"; llama4_scout: str = "mock-scout"; llama4_maverick: str = "mock-mav"; llama4_embedding_model: str = "mock-emb"; openai: str = "mock-gpt"; anthropic: str = "mock-claude"
    @dataclass class MockFmPricing: prompt: float = 0.5; completion: float = 1.5
    @dataclass class MockFmConfig: keys: Dict = field(default_factory=lambda: {"vertex": "auth", "openai": "sk-mock", "anthropic": "ak-mock"}); defaults: MockFmDefaults = field(default_factory=MockFmDefaults); pricing: Dict = field(default_factory=lambda: {"vertex": {"gemini-1.5-flash-001": MockFmPricing()}, "openai": {"mock-gpt": MockFmPricing()}, "anthropic": {"mock-claude": MockFmPricing()}})
    @dataclass class MockConfig: foundation_models: MockFmConfig = field(default_factory=MockFmConfig); miz_oki_schema_version: str = "3.0"; def get_model_info(self, alias): return {"provider": "mock", "model_id": alias, "pricing": {"prompt": 0.1, "completion": 0.2}}
    _config_obj = MockConfig()
    VERTEX_SDK_AVAILABLE = False; OPENAI_SDK_AVAILABLE = False; ANTHROPIC_SDK_AVAILABLE = False
    # --- End Mock Setup ---

logger = logging.getLogger('MIZ-OKI.FoundationModelClient')

# --- Helper Classes ---
class RateLimiter:
    """Simple token bucket rate limiter for async operations."""
    def __init__(self, rate: float, capacity: float):
        """
        Args:
            rate (float): Tokens added per second.
            capacity (float): Maximum tokens in the bucket.
        """
        if rate <= 0 or capacity <= 0:
            raise ValueError("Rate and capacity must be positive")
        self.rate = rate
        self.capacity = capacity
        self._tokens = capacity
        self._last_update = time.monotonic()
        self._lock = asyncio.Lock()

    async def wait(self):
        """Waits if necessary to ensure rate limit is not exceeded."""
        async with self._lock:
            now = time.monotonic()
            elapsed = now - self._last_update
            self._last_update = now
            # Add tokens earned during elapsed time
            self._tokens = min(self.capacity, self._tokens + elapsed * self.rate)

            if self._tokens < 1:
                # Calculate wait time needed to get 1 token
                wait_time = (1 - self._tokens) / self.rate
                logger.debug(f"Rate limiter: Waiting for {wait_time:.3f} seconds...")
                await asyncio.sleep(wait_time)
                # Update tokens after waiting
                self._tokens += wait_time * self.rate
                self._last_update = time.monotonic() # Update last_update after sleep

            # Consume one token
            self._tokens -= 1
            # logger.debug(f"Rate limiter: Token consumed. Remaining: {self._tokens:.2f}")

# Define common retryable exceptions for API calls
# Add provider-specific exceptions if needed and available
RETRYABLE_HTTP_STATUS = {429, 500, 502, 503, 504} # Too Many Requests, Server Errors
RETRYABLE_EXCEPTIONS = (
    aiohttp.ClientConnectionError, # Network issues
    aiohttp.ClientPayloadError,    # Issues with response payload
    asyncio.TimeoutError,          # Request timeout
    # OpenAI specific (if SDK used)
    OpenAIRateLimitError if OPENAI_SDK_AVAILABLE else ConnectionError, # Placeholder if SDK missing
    OpenAIAPIError if OPENAI_SDK_AVAILABLE else ConnectionError, # General OpenAI API errors (some might be retryable)
    # Anthropic specific (if SDK used) - Add specific exceptions here
    # Vertex AI specific - SDK might raise google.api_core.exceptions.RetryError or ServiceUnavailable
    gcp_exceptions.RetryError if GCP_SDK_AVAILABLE else ConnectionError,
    gcp_exceptions.ServiceUnavailable if GCP_SDK_AVAILABLE else ConnectionError,
)

def is_retryable_exception(exception: BaseException) -> bool:
    """Checks if an exception is retryable."""
    if isinstance(exception, aiohttp.ClientResponseError):
        return exception.status in RETRYABLE_HTTP_STATUS
    return isinstance(exception, RETRYABLE_EXCEPTIONS)

# --- Foundation Model Client Implementation ---
class FoundationModelClient:
    """Unified client for interacting with various Foundation Models via API. Deployed as a service."""
    def __init__(self, config: EnhancedConfig):
        if not config: raise InitializationError("Config required for FoundationModelClient.")
        self.config = config
        # Cache for storing responses (key: hash of request, value: response payload)
        self.cache = TTLCache(maxsize=500, ttl=300) # 5 min TTL cache
        self.logger = logging.getLogger('MIZ-OKI.FoundationModelClient')
        # Rate limiters per provider (adjust rates based on actual limits/tiers)
        self.rate_limiters = {
            'vertex': RateLimiter(rate=100/60, capacity=100), # Example: 100 req/min
            'openai': RateLimiter(rate=60/60, capacity=60),   # Example: 60 req/min
            'anthropic': RateLimiter(rate=5, capacity=10),    # Example: 5 req/sec
            'mock': RateLimiter(rate=100, capacity=100),      # High rate for mocks
        }
        self.clients: Dict[str, Any] = {} # Stores initialized SDK clients or config
        self.session: Optional[aiohttp.ClientSession] = None
        self.initialized = False
        self.metrics = defaultdict(lambda: defaultdict(int)) # Track calls, errors, tokens per model

    async def initialize(self):
        """Initialize HTTP session and SDK clients asynchronously."""
        if self.initialized: return
        self.logger.info("Initializing FoundationModelClient...")
        self.session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=120)) # 120s total timeout

        # Initialize Vertex AI (if SDK available and configured)
        if VERTEX_SDK_AVAILABLE and 'vertex' in self.config.foundation_models.keys:
            try:
                # SDK should be initialized in Cell 1, just store references or perform checks
                # Store model instances directly? Or just check availability? Storing for now.
                default_vertex_model = self.config.foundation_models.defaults.vertex
                default_embedding_model = self.config.foundation_models.defaults.llama4_embedding_model
                self.clients['vertex_genai'] = GenerativeModel(default_vertex_model) # Store default model instance
                self.clients['vertex_embedding'] = TextEmbeddingModel.from_pretrained(default_embedding_model)
                logger.info(f"Vertex AI clients initialized (GenAI: {default_vertex_model}, Embedding: {default_embedding_model}).")
            except Exception as e: logger.error(f"Vertex AI client initialization failed: {e}", exc_info=True)

        # Initialize OpenAI
        if OPENAI_SDK_AVAILABLE and 'openai' in self.config.foundation_models.keys:
            try:
                self.clients['openai'] = AsyncOpenAI(api_key=self.config.foundation_models.keys['openai'], max_retries=0) # Handle retries manually with tenacity
                logger.info("OpenAI client initialized.")
            except Exception as e: logger.error(f"OpenAI client init failed: {e}", exc_info=True)

        # Store Anthropic config (using REST via aiohttp)
        if 'anthropic' in self.config.foundation_models.keys:
            self.clients['anthropic'] = {"api_key": self.config.foundation_models.keys['anthropic'], "base_url": "https://api.anthropic.com/v1"}
            logger.info("Anthropic client config stored (using REST).")

        # Add mock client if needed for testing unresolved models
        self.clients['mock'] = {"status": "mock"}

        self.initialized = True
        self.logger.info("FoundationModelClient initialized.")

    async def cleanup(self):
        """Close aiohttp session and potentially other client resources."""
        if self.session:
            await self.session.close()
            self.session = None
        # Close SDK clients if they have explicit close methods (OpenAI doesn't typically)
        self.clients = {}
        self.cache.clear()
        self.initialized = False
        self.logger.info("FoundationModelClient cleaned up.")

    def _get_client_and_model(self, model_alias_or_id: str) -> Tuple[Optional[Any], Optional[str], Optional[str]]:
        """Resolves alias, gets provider client/config and actual model ID."""
        model_info = self.config.get_model_info(model_alias_or_id)
        if not model_info:
            self.logger.error(f"Could not resolve model info for alias/ID: '{model_alias_or_id}'")
            return None, None, None

        provider = model_info["provider"]
        model_id = model_info["model_id"] # The actual ID used by the provider

        # Get the client or config associated with the provider
        client_or_config = self.clients.get(provider)

        # Special handling for Vertex AI which might use different client objects
        if provider == 'vertex':
            # Determine if it's an embedding or generative task based on model ID (simple check)
            if 'embedding' in model_id.lower(): client_or_config = self.clients.get('vertex_embedding')
            else: client_or_config = self.clients.get('vertex_genai')

        if client_or_config is None:
             self.logger.error(f"Client/Config for provider '{provider}' not initialized or key missing.")
             return None, None, provider # Return provider even if client is missing

        return client_or_config, model_id, provider

    def _estimate_cost(self, provider: str, model_id: str, prompt_tokens: int, completion_tokens: int) -> Optional[float]:
        """Estimates cost based on token counts and config pricing."""
        try:
            pricing_info = self.config.foundation_models.pricing.get(provider, {}).get(model_id)
            if pricing_info:
                cost = (prompt_tokens / 1_000_000 * pricing_info.prompt) + (completion_tokens / 1_000_000 * pricing_info.completion)
                return round(cost, 6) # Return cost rounded to 6 decimal places
        except Exception as e:
            logger.warning(f"Cost estimation failed for {provider}/{model_id}: {e}")
        return None

    def _update_metrics(self, model_alias_or_id: str, status: str, duration: float, p_tokens: int = 0, c_tokens: int = 0):
        """Updates call metrics."""
        self.metrics[model_alias_or_id][status] += 1
        self.metrics[model_alias_or_id]['total_duration'] = self.metrics[model_alias_or_id].get('total_duration', 0) + duration
        self.metrics[model_alias_or_id]['prompt_tokens'] = self.metrics[model_alias_or_id].get('prompt_tokens', 0) + p_tokens
        self.metrics[model_alias_or_id]['completion_tokens'] = self.metrics[model_alias_or_id].get('completion_tokens', 0) + c_tokens

    def _handle_error(self, model_alias_or_id: str, error: Exception):
        """Handles errors during API calls."""
        self._update_metrics(model_alias_or_id, 'error', 0) # Record error count
        # Log specific error types if needed
        logger.error(f"API call failed for {model_alias_or_id}: {type(error).__name__} - {error}")

    def _generate_cache_key(self, model_alias: str, **kwargs) -> str:
        """Generates a cache key based on model and relevant arguments."""
        key_dict = {"model": model_alias, **kwargs}
        # Convert complex objects like lists/dicts to sorted tuples/items for consistent hashing
        def make_hashable(o):
            if isinstance(o, dict): return tuple(sorted((k, make_hashable(v)) for k, v in o.items()))
            if isinstance(o, (list, tuple)): return tuple(make_hashable(e) for e in o)
            return o
        try:
            hashable_key = make_hashable(key_dict)
            return str(hash(hashable_key))
        except Exception as e:
             logger.warning(f"Could not generate hashable cache key, using simple string: {e}")
             # Fallback to less reliable string representation
             return json.dumps(key_dict, sort_keys=True, default=str)


    # --- Provider Specific Methods ---

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10), retry=retry_if_exception_type(is_retryable_exception))
    async def _vertex_generate(self, client_instance: GenerativeModel, model_id: str, prompt: str, max_tokens: int, temperature: float, **kwargs) -> Dict:
        """Generates text using Vertex AI GenerativeModel."""
        if not VERTEX_SDK_AVAILABLE: raise RuntimeError("Vertex AI SDK not available.")
        try:
            # Ensure we're using the correct model instance if client_instance is just the default
            if client_instance.model_name != model_id:
                model_instance = GenerativeModel(model_id)
                logger.debug(f"Using specific Vertex model instance: {model_id}")
            else:
                model_instance = client_instance # Use the default instance passed

            generation_config = {
                "max_output_tokens": max_tokens,
                "temperature": temperature,
                "top_p": kwargs.get("top_p", 0.95)
            }
            # Vertex SDK generate_content is sync, run in thread
            # Use generate_content_async if available and preferred
            if hasattr(model_instance, 'generate_content_async'):
                 response: GenerationResponse = await model_instance.generate_content_async(
                     prompt, generation_config=generation_config
                 )
            else:
                 response: GenerationResponse = await asyncio.to_thread(
                     model_instance.generate_content,
                     prompt, generation_config=generation_config
                 )

            # Extract text, handling potential lack of content or errors
            text_response = ""
            if response.candidates:
                first_candidate = response.candidates[0]
                if first_candidate.content and first_candidate.content.parts:
                    text_response = "".join(part.text for part in first_candidate.content.parts if hasattr(part, 'text'))
                # Check finish reason for safety issues
                if first_candidate.finish_reason not in [FinishReason.STOP, FinishReason.MAX_TOKENS, None]: # Allow None for safety
                     raise RuntimeError(f"Vertex AI generation stopped due to safety or other reason: {first_candidate.finish_reason.name}")
            else:
                 # Handle cases where response might be blocked or empty
                 logger.warning(f"Vertex AI response for {model_id} has no candidates. Response: {response}")
                 # Check for prompt feedback if available
                 if response.prompt_feedback and response.prompt_feedback.block_reason:
                      raise RuntimeError(f"Vertex AI prompt blocked: {response.prompt_feedback.block_reason.name}")


            # Token count might be in usage_metadata
            prompt_tokens = response.usage_metadata.prompt_token_count if hasattr(response, 'usage_metadata') else 0
            completion_tokens = response.usage_metadata.candidates_token_count if hasattr(response, 'usage_metadata') else 0
            return {"text": text_response, "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens}
        except Exception as e:
            logger.error(f"Vertex AI generation error for {model_id}: {e}", exc_info=True)
            raise # Re-raise for tenacity retry

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10), retry=retry_if_exception_type(is_retryable_exception))
    async def _vertex_embed(self, client_instance: TextEmbeddingModel, model_id: str, data: Union[str, List[str]], **kwargs) -> Dict:
        """Generates embeddings using Vertex AI TextEmbeddingModel."""
        if not VERTEX_SDK_AVAILABLE: raise RuntimeError("Vertex AI SDK not available.")
        try:
            # Ensure we're using the correct model instance if client_instance is just the default
            # Note: TextEmbeddingModel.from_pretrained might be sync, handle if needed
            if client_instance._model_id != model_id: # Accessing private attr might be fragile
                embedding_client = TextEmbeddingModel.from_pretrained(model_id)
                logger.debug(f"Using specific Vertex embedding instance: {model_id}")
            else:
                embedding_client = client_instance # Use the default instance passed

            # The SDK method handles batching automatically
            instances = [data] if isinstance(data, str) else data
            # Prepare input objects
            embedding_inputs = [TextEmbeddingInput(text=text) for text in instances]

            # get_embeddings is sync, run in thread
            embeddings_response = await asyncio.to_thread(embedding_client.get_embeddings, embedding_inputs)

            embeddings = [e.values for e in embeddings_response]
            # Token count estimation for embeddings is tricky, often priced per 1k chars or per request
            # Placeholder: estimate based on char count (crude)
            total_chars = sum(len(text) for text in instances)
            prompt_tokens = total_chars // 4 # Very rough estimate
            return {"embeddings": embeddings, "prompt_tokens": prompt_tokens, "completion_tokens": 0}
        except Exception as e:
            logger.error(f"Vertex AI embedding error for {model_id}: {e}", exc_info=True)
            raise # Re-raise for tenacity retry

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10), retry=retry_if_exception_type(is_retryable_exception))
    async def _openai_generate(self, client: AsyncOpenAI, model_id: str, prompt: str, max_tokens: int, temperature: float, **kwargs) -> Dict:
        """Generates text using OpenAI API."""
        if not OPENAI_SDK_AVAILABLE: raise RuntimeError("OpenAI SDK not available.")
        try:
            response = await client.chat.completions.create(
                model=model_id,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=kwargs.get("top_p", 1.0)
                # Add other OpenAI specific params if needed
            )
            text_response = response.choices[0].message.content if response.choices else ""
            prompt_tokens = response.usage.prompt_tokens if response.usage else 0
            completion_tokens = response.usage.completion_tokens if response.usage else 0
            return {"text": text_response, "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens}
        except Exception as e:
            logger.error(f"OpenAI generation error for {model_id}: {e}", exc_info=True)
            raise # Re-raise for tenacity retry

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10), retry=retry_if_exception_type(is_retryable_exception))
    async def _openai_embed(self, client: AsyncOpenAI, model_id: str, data: Union[str, List[str]], **kwargs) -> Dict:
        """Generates embeddings using OpenAI API."""
        if not OPENAI_SDK_AVAILABLE: raise RuntimeError("OpenAI SDK not available.")
        try:
            # OpenAI API expects 'input' which can be str or list[str]
            response = await client.embeddings.create(model=model_id, input=data)
            embeddings = [item.embedding for item in response.data]
            prompt_tokens = response.usage.prompt_tokens if response.usage else 0
            # OpenAI embedding response format might vary, adjust parsing as needed
            # If input was single string, return single embedding list
            final_embeddings = embeddings[0] if isinstance(data, str) and len(embeddings) == 1 else embeddings
            return {"embeddings": final_embeddings, "prompt_tokens": prompt_tokens, "completion_tokens": 0}
        except Exception as e:
            logger.error(f"OpenAI embedding error for {model_id}: {e}", exc_info=True)
            raise # Re-raise for tenacity retry

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10), retry=retry_if_exception_type(is_retryable_exception))
    async def _anthropic_generate(self, client_config: Dict, model_id: str, prompt: str, max_tokens: int, temperature: float, **kwargs) -> Dict:
        """Generates text using Anthropic API (REST via aiohttp)."""
        if not self.session: raise RuntimeError("Aiohttp session not initialized.")
        headers = {
            "x-api-key": client_config["api_key"],
            "anthropic-version": "2023-06-01", # Use appropriate API version
            "content-type": "application/json"
        }
        # Use messages format
        payload = {
            "model": model_id,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": kwargs.get("top_p", 1.0)
            # Add other Anthropic specific params if needed
        }
        url = f"{client_config['base_url'].rstrip('/')}/messages"
        try:
            async with self.session.post(url, headers=headers, json=payload) as response:
                if response.status >= 400: # Check for HTTP errors
                    error_text = await response.text()
                    logger.error(f"Anthropic API HTTP error {response.status}: {error_text}")
                    response.raise_for_status() # Raise ClientResponseError

                result = await response.json()
                # Extract text from content blocks
                text_response = "".join([block.get("text", "") for block in result.get("content", []) if block.get("type") == "text"])
                # Token counts from usage field
                prompt_tokens = result.get("usage", {}).get("input_tokens", 0)
                completion_tokens = result.get("usage", {}).get("output_tokens", 0)
                return {"text": text_response, "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens}
        except aiohttp.ClientResponseError as http_err:
            # Already logged above, just re-raise for tenacity
            raise http_err
        except Exception as e:
            logger.error(f"Anthropic generation error for {model_id}: {e}", exc_info=True)
            raise # Re-raise for tenacity retry

    # --- Unified Public Methods (Accepting/Returning MIZ OKI Payloads) ---

    async def generate_text(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Generates text using the specified model. Expects/Returns MIZ OKI."""
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); model_alias = payload.get("model_alias"); prompt = payload.get("prompt"); max_tokens = payload.get("max_tokens", 1024); temperature = payload.get("temperature", 0.7); kwargs = payload.get("kwargs", {})
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        # --- MIZ OKI Response Setup ---
        response = {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_id, "trace_id": trace_id,
            "workflow_execution_id": input_data.get("workflow_execution_id"), "step_id": input_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "FoundationModelClient", "target_component": input_data.get("source_component"),
            "status": "pending", "payload": None, "error_details": None, "metadata": {}
        }

        if not self.initialized: errors.append({"code": "NOT_INITIALIZED", "message": "FM Client not initialized."})
        if not model_alias or not prompt: errors.append({"code": "MISSING_DATA", "message": "'payload.model_alias' and 'payload.prompt' required."})
        if errors:
            response["status"] = "bad_request"; response["error_details"] = errors
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        status = "pending"; response_payload = None; response_metadata = {}

        # --- Caching ---
        cache_key = self._generate_cache_key(model_alias=model_alias, prompt=prompt, max_tokens=max_tokens, temperature=temperature, **kwargs)
        if cached := self.cache.get(cache_key):
            response["status"] = "success"; response["payload"] = cached
            response["metadata"] = {"cached": True, "processing_duration_ms": (time.monotonic() - start_time) * 1000}
            logger.debug(f"Cache hit for text generation: {model_alias}")
            self._update_metrics(model_alias, 'cache_hit', 0)
            return response
        # --- End Caching ---

        client, model_id, provider = self._get_client_and_model(model_alias)
        if not provider or not model_id: errors.append({"code": "MODEL_RESOLUTION_FAILED", "message": f"Cannot resolve model '{model_alias}'."})
        # Allow provider='vertex' even if client is None initially, specific methods handle client selection
        if not client and provider != 'vertex': errors.append({"code": "CLIENT_UNAVAILABLE", "message": f"Client for provider '{provider}' unavailable."})
        if errors:
            response["status"] = "config_error"; response["error_details"] = errors
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        try:
            await self.rate_limiters[provider].wait() # Wait for rate limit
            provider_method = None
            if provider == 'vertex': provider_method = self._vertex_generate
            elif provider == 'openai': provider_method = self._openai_generate
            elif provider == 'anthropic': provider_method = self._anthropic_generate
            elif provider == 'mock': # Handle mock provider for testing
                 provider_method = lambda c, m, p, mx, t, **kw: asyncio.sleep(0.01, result={"text": "Mock response", "prompt_tokens": 10, "completion_tokens": 5})
            # Add other providers...

            if not provider_method: raise NotImplementedError(f"Generation not implemented for provider: {provider}")

            # Call the provider-specific method with retry logic handled by tenacity
            result = await provider_method(client, model_id, prompt, max_tokens, temperature, **kwargs)

            duration = time.monotonic() - start_time
            cost = self._estimate_cost(provider, model_id, result["prompt_tokens"], result["completion_tokens"])
            status = "success"
            response_payload = {"generated_text": result["text"]}
            response_metadata = {"provider": provider, "model_id": model_id, "duration_ms": duration * 1000, "prompt_tokens": result["prompt_tokens"], "completion_tokens": result["completion_tokens"], "estimated_cost_usd": cost, "cached": False}
            self.cache[cache_key] = response_payload # Cache successful result payload
            self._update_metrics(model_alias, 'success', duration, result["prompt_tokens"], result["completion_tokens"])

        except RetryError as retry_err: # Catch tenacity retry error
             status = "failed"; errors.append({"code": "API_RETRY_ERROR", "message": f"API call failed after multiple retries: {retry_err.last_attempt.exception()}"}); self._handle_error(model_alias, retry_err.last_attempt.exception()); logger.error(f"Error generating text with {model_alias} after retries: {retry_err.last_attempt.exception()}", exc_info=True)
        except Exception as e:
             status = "internal_error"; errors.append({"code": "GENERATION_ERROR", "message": str(e)}); self._handle_error(model_alias, e); logger.error(f"Error generating text with {model_alias}: {e}", exc_info=True)

        response["status"] = status
        response["payload"] = response_payload
        response["metadata"] = response_metadata
        response["metadata"]["total_processing_duration_ms"] = (time.monotonic() - start_time) * 1000 # Overwrite duration with total time
        if errors: response["error_details"] = errors
        return response

    async def generate_embedding(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Generates embeddings. Expects/Returns MIZ OKI."""
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); model_alias = payload.get("model_alias"); data = payload.get("data") # data can be str or List[str]
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        # --- MIZ OKI Response Setup ---
        response = {
            "miz_oki_version": self.config.miz_oki_schema_version,
            "request_id": request_id, "trace_id": trace_id,
            "workflow_execution_id": input_data.get("workflow_execution_id"), "step_id": input_data.get("step_id"),
            "timestamp": datetime.now(datetime.timezone.utc).isoformat(),
            "source_component": "FoundationModelClient", "target_component": input_data.get("source_component"),
            "status": "pending", "payload": None, "error_details": None, "metadata": {}
        }

        if not self.initialized: errors.append({"code": "NOT_INITIALIZED", "message": "FM Client not initialized."})
        if data is None: errors.append({"code": "MISSING_DATA", "message": "'payload.data' (str or List[str]) required."})
        if errors:
            response["status"] = "bad_request"; response["error_details"] = errors
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        model_alias = model_alias or self.config.foundation_models.defaults.llama4_embedding_model
        status = "pending"; response_payload = None; response_metadata = {}

        # --- Caching (Less effective for embeddings unless exact string match) ---
        # cache_key = self._generate_cache_key(model_alias=model_alias, data=data)
        # if cached := self.cache.get(cache_key): ...
        # --- End Caching ---

        client, model_id, provider = self._get_client_and_model(model_alias)
        if not provider or not model_id: errors.append({"code": "MODEL_RESOLUTION_FAILED", "message": f"Cannot resolve model '{model_alias}'."})
        if not client and provider != 'vertex': errors.append({"code": "CLIENT_UNAVAILABLE", "message": f"Client for provider '{provider}' unavailable."})
        if errors:
            response["status"] = "config_error"; response["error_details"] = errors
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        try:
            await self.rate_limiters[provider].wait()
            provider_method = None
            if provider == 'vertex': provider_method = self._vertex_embed
            elif provider == 'openai': provider_method = self._openai_embed
            # Add other providers...
            elif provider == 'mock': # Handle mock provider for testing
                 provider_method = lambda c, m, d, **kw: asyncio.sleep(0.01, result={"embeddings": [[0.1]*10] if isinstance(d, list) else [0.1]*10, "prompt_tokens": 10, "completion_tokens": 0})

            if not provider_method: raise NotImplementedError(f"Embedding not implemented for provider: {provider}")

            # Call the provider-specific method with retry logic handled by tenacity
            result = await provider_method(client, model_id, data)

            duration = time.monotonic() - start_time
            cost = self._estimate_cost(provider, model_id, result["prompt_tokens"], result["completion_tokens"])
            status = "success"
            # Return single embedding if input was single string, else list
            response_payload = {"embedding": result["embeddings"]} # Keep as list consistently
            response_metadata = {"provider": provider, "model_id": model_id, "duration_ms": duration * 1000, "prompt_tokens": result["prompt_tokens"], "estimated_cost_usd": cost, "cached": False}
            # self.cache[cache_key] = response_payload # Cache successful result payload
            self._update_metrics(model_alias, 'success', duration, result["prompt_tokens"], result["completion_tokens"])

        except RetryError as retry_err: # Catch tenacity retry error
             status = "failed"; errors.append({"code": "API_RETRY_ERROR", "message": f"API call failed after multiple retries: {retry_err.last_attempt.exception()}"}); self._handle_error(model_alias, retry_err.last_attempt.exception()); logger.error(f"Error generating embedding with {model_alias} after retries: {retry_err.last_attempt.exception()}", exc_info=True)
        except Exception as e:
             status = "internal_error"; errors.append({"code": "EMBEDDING_ERROR", "message": str(e)}); self._handle_error(model_alias, e); logger.error(f"Error generating embedding with {model_alias}: {e}", exc_info=True)

        response["status"] = status
        response["payload"] = response_payload
        response["metadata"] = response_metadata
        response["metadata"]["total_processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        if errors: response["error_details"] = errors
        return response

    async def analyze(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Performs analysis (e.g., sentiment) using an appropriate model. Expects/Returns MIZ OKI."""
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); model_alias = payload.get("model_alias"); text = payload.get("text"); analysis_type = payload.get("analysis_type", "sentiment") # Default to sentiment
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        # --- MIZ OKI Response Setup ---
        response = { # ... standard MIZ OKI response structure ...
            "source_component": "FoundationModelClient", "target_component": input_data.get("source_component"),
            "status": "pending", "payload": None, "error_details": None, "metadata": {}
        }

        if not self.initialized: errors.append({"code": "NOT_INITIALIZED", "message": "FM Client not initialized."})
        if not text: errors.append({"code": "MISSING_DATA", "message": "'payload.text' required."})
        # --- TODO: Add model alias resolution and client check similar to generate_text ---
        model_alias = model_alias or self.config.foundation_models.defaults.feedback_analyzer_model # Example: use feedback model
        client, model_id, provider = self._get_client_and_model(model_alias)
        if not provider or not model_id or (not client and provider != 'vertex'): errors.append({"code": "MODEL_UNAVAILABLE", "message": f"Model/Client for '{model_alias}' unavailable."})
        # --- End TODO ---
        if errors:
            response["status"] = "bad_request"; response["error_details"] = errors
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        status = "pending"; response_payload = None; response_metadata = {}
        try:
            # --- TODO: Implement analysis logic using appropriate provider method ---
            # This might involve constructing a specific prompt for generate_text
            # or calling a dedicated analysis endpoint if available.
            # Example using generate_text for sentiment:
            prompt = f"Analyze the sentiment of the following text (positive, negative, neutral):\nText: {text}\nSentiment:"
            gen_request = {
                "payload": {"prompt": prompt, "model_alias": model_alias, "max_tokens": 10, "temperature": 0.1},
                "trace_id": trace_id, "request_id": f"fm_analyze_{request_id}"
            }
            gen_response = await self.generate_text(input_data=gen_request) # Call internal generate_text

            if gen_response.get("status") == "success":
                analysis_result = gen_response.get("payload", {}).get("generated_text", "").strip().lower()
                status = "success"
                response_payload = {"analysis_type": analysis_type, "result": analysis_result} # Simple result
                response_metadata = gen_response.get("metadata", {}) # Inherit metadata
            else:
                status = "failed"; errors = gen_response.get("error_details")
            # --- End TODO ---

        except Exception as e:
             status = "internal_error"; errors.append({"code": "ANALYSIS_ERROR", "message": str(e)}); logger.error(f"Error during analysis with {model_alias}: {e}", exc_info=True)

        response["status"] = status
        response["payload"] = response_payload
        response["metadata"] = response_metadata
        response["metadata"]["total_processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        if errors: response["error_details"] = errors
        return response

    async def extract_kg_data_from_content(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extracts KG entities/relationships using an LLM. Expects/Returns MIZ OKI."""
        start_time = time.monotonic(); errors = []
        # Parse MIZ OKI input
        payload = input_data.get("payload", {}); model_alias = payload.get("model_alias"); content = payload.get("content"); data_type = payload.get("data_type"); context = payload.get("context", {})
        trace_id = input_data.get("trace_id"); request_id = input_data.get("request_id")

        # --- MIZ OKI Response Setup ---
        response = { # ... standard MIZ OKI response structure ...
            "source_component": "FoundationModelClient", "target_component": input_data.get("source_component"),
            "status": "pending", "payload": None, "error_details": None, "metadata": {}
        }

        if not self.initialized: errors.append({"code": "NOT_INITIALIZED", "message": "FM Client not initialized."})
        if not content: errors.append({"code": "MISSING_DATA", "message": "'payload.content' required."})
        # --- TODO: Add model alias resolution and client check ---
        model_alias = model_alias or self.config.foundation_models.defaults.llama4_maverick # Use powerful model
        client, model_id, provider = self._get_client_and_model(model_alias)
        if not provider or not model_id or (not client and provider != 'vertex'): errors.append({"code": "MODEL_UNAVAILABLE", "message": f"Model/Client for '{model_alias}' unavailable."})
        # --- End TODO ---
        if errors:
            response["status"] = "bad_request"; response["error_details"] = errors
            response["metadata"]["processing_duration_ms"] = (time.monotonic() - start_time) * 1000
            return response

        status = "pending"; response_payload = None; response_metadata = {}
        try:
            # --- TODO: Construct a robust prompt for KG extraction ---
            # Include instructions, desired output format (JSON), examples, context.
            prompt = f"""Extract structured Knowledge Graph data (entities and relationships) from the following content.
Content Type: {data_type or 'Unknown'}
Context: {json.dumps(context, default=str)}
Content:
---
{content}
---
Output ONLY a JSON object with two keys: "entities" and "relationships".
Entities should be a list of objects, each with "type", "name", and other relevant properties. Include "_resolution_hints" with original identifiers if possible.
Relationships should be a list of objects, each with "source_hints", "target_hints", "type", and properties.
Example Entity: {{"type": "Person", "name": "John Doe", "_resolution_hints": {{"source_id": "user123"}}}}
Example Relationship: {{"source_hints": {{"name": "Company A"}}, "target_hints": {{"name": "Product B"}}, "type": "PRODUCES"}}
JSON Output:"""
            # --- End TODO ---

            # Call generate_text
            fm_request = {
                "payload": {"prompt": prompt, "model_alias": model_alias, "max_tokens": 2048, "temperature": 0.1}, # Low temp for structured output
                "trace_id": trace_id, "request_id": f"fm_kg_extract_{request_id}"
            }
            fm_response = await self.generate_text(input_data=fm_request) # Call internal generate_text

            if fm_response.get("status") == "success":
                generated_text = fm_response.get("payload", {}).get("generated_text", "")
                response_metadata = fm_response.get("metadata", {}) # Get metadata from FM call
                try:
                    # Attempt to parse the JSON output from the LLM
                    kg_data = json.loads(generated_text)
                    if isinstance(kg_data, dict) and "entities" in kg_data and "relationships" in kg_data:
                        status = "success"
                        response_payload = {
                            "entities": kg_data.get("entities", []),
                            "relationships": kg_data.get("relationships", [])
                        }
                    else:
                        status = "failed"; errors.append({"code": "INVALID_LLM_JSON", "message": "LLM did not return valid JSON with 'entities' and 'relationships' keys.", "raw_output": generated_text[:500]})
                except json.JSONDecodeError as json_e:
                    status = "failed"; errors.append({"code": "LLM_JSON_PARSE_ERROR", "message": f"Failed to parse LLM JSON output: {json_e}", "raw_output": generated_text[:500]})
            else:
                 status = "failed"; errors = fm_response.get("error_details") # Propagate errors from generate_text

        except Exception as e:
             status = "internal_error"; errors.append({"code": "KG_EXTRACT_ERROR", "message": str(e)}); logger.error(f"Error extracting KG data with {model_alias}: {e}", exc_info=True)

        response["status"] = status
        response["payload"] = response_payload
        response["metadata"] = response_metadata
        response["metadata"]["total_processing_duration_ms"] = (time.monotonic() - start_time) * 1000
        if errors: response["error_details"] = errors
        return response


# --- Initialization (Conceptual - Service deployed separately) ---
# fm_client: Optional[FoundationModelClient] = None

# async def initialize_fm_client():
#     global fm_client
#     if not _config_obj:
#         logger.critical("Cannot initialize FoundationModelClient: CONFIG_OBJ not loaded.")
#         return
#     try:
#         fm_client = FoundationModelClient(_config_obj)
#         await fm_client.initialize() # Initialize async
#         logger.info("FoundationModelClient initialized successfully.")
#     except Exception as fm_init_e:
#         logger.critical(f"FoundationModelClient init failed: {fm_init_e}", exc_info=True)
#         fm_client = None

# async def cleanup_fm_client():
#      if fm_client:
#          await fm_client.cleanup()
#          logger.info("FoundationModelClient cleaned up.")

# --- Example Usage (Conceptual - How another tool/service might call this) ---
# async def some_other_tool_method(fm_client_proxy: FoundationModelClient):
#      request = {
#          "payload": {"prompt": "Translate to French: Hello", "model_alias": "gemini-1.5-flash-001"},
#          "trace_id": "some-trace-id"
#      }
#      response = await fm_client_proxy.generate_text(input_data=request)
#      if response["status"] == "success":
#          print(response["payload"]["generated_text"])
#      else:
#          print("Error:", response["error_details"])

print("\n--- MIZ 3.0 Foundation Model Client (Cell 18 - New Implementation) ---")
print("Provides unified async interface for Vertex, OpenAI, Anthropic (REST).")
print("Includes rate limiting, retries, caching, cost estimation placeholders.")
print("Handles MIZ OKI I/O structure.")
print("Requires installation of provider SDKs (openai) and aiohttp, tenacity, cachetools.")
print("----------------------------------------------------------------------")

SyntaxError: invalid syntax (<ipython-input-21-e49335905976>, line 44)