In [1]:
# Cell 1: Environment Setup and Configuration (Enhanced for OKI - Refined)
# Status: Uses EnhancedConfig. Stricter secret validation added. LLaMA 4 config placeholders clarified. Orchestrator defaults confirmed.
# MIZ 3.0 OKI: Aligned with need for robust config. LLaMA 4 models specified (as placeholders). Emphasizes infrastructure needs.
# VERDICT: No changes required based on plan review. Code is aligned with Phase 1 goals.

import os
import numpy as np
import pandas as pd
import tensorflow as tf
import datetime
import matplotlib.pyplot as plt
from google.cloud import aiplatform
from google.cloud import storage
from google.cloud import exceptions as gcp_exceptions
import logging
import random
import time
import uuid
import traceback
from abc import ABC, abstractmethod
from collections import deque, defaultdict, Counter
from concurrent.futures import ThreadPoolExecutor # Kept for now, orchestrator refactor needed
from typing import Dict, Any, Optional, List, Union, Callable, TypeVar, Protocol, Tuple, Set, Type
from contextlib import contextmanager
import hashlib
import functools
import heapq
import requests
import json # Keep for loading potential JSON config parts and logging
import re # Added for cleaning BQ column names

# --- Neo4j Import ---
try:
    from neo4j import GraphDatabase, basic_auth
    NEO4J_AVAILABLE = True
except ImportError:
    NEO4J_AVAILABLE = False
    GraphDatabase = None # Placeholder
    basic_auth = None
    print("Warning: 'neo4j' library not found. Install (`pip install neo4j`) for Neo4j integration.")

# --- Enhanced Configuration Class ---
class EnhancedConfig:
    """Handles loading, validation, and access for MIZ 3.0 OKI configuration."""
    def __init__(self, default_config_path=None): # Allow loading from file if needed later
        self.logger = logging.getLogger('MIZ-OKI.Config')
        self._config = {}
        self._load_config()

    def _load_config(self):
        """Load configuration from environment variables and defaults."""
        self.logger.info("Loading MIZ 3.0 OKI configuration...")

        # --- Critical Secrets & Identifiers (Load from Env Vars ONLY, with validation) ---
        self.project_id = os.environ.get("GOOGLE_CLOUD_PROJECT")
        if not self.project_id:
            self.logger.warning("GOOGLE_CLOUD_PROJECT env var not set. Using default 'spry-bus-425315-p6'.")
            self.project_id = "spry-bus-425315-p6"

        self.region = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

        self.neo4j_uri = os.environ.get("NEO4J_URI")
        self.neo4j_user = os.environ.get("NEO4J_USER")
        self.neo4j_password = os.environ.get("NEO4J_PASSWORD")
        if not all([self.neo4j_uri, self.neo4j_user, self.neo4j_password]):
             self.logger.warning("NEO4J_URI, NEO4J_USER, or NEO4J_PASSWORD env vars missing. Neo4j connection WILL FAIL if used.")
        elif self.neo4j_password == "password":
             # MIZ 3.0 OKI: Stricter check - halt execution for default password
             self.logger.critical("NEO4J_PASSWORD is set to the default insecure value 'password'. CHANGE THIS IMMEDIATELY.")
             raise ValueError("Insecure default Neo4j password detected. Set NEO4J_PASSWORD environment variable.")

        self.miz_salt = os.environ.get("MIZ_SALT")
        if not self.miz_salt or self.miz_salt == "default_insecure_salt_replace_me_!!":
            # MIZ 3.0 OKI: Stricter check - halt execution for missing/default salt
            self.logger.critical("MIZ_SALT environment variable is missing or set to the default insecure value. SET THIS SECURELY.")
            raise ValueError("Missing or insecure MIZ_SALT detected. Set MIZ_SALT environment variable securely.")

        self.openai_api_key = os.environ.get("OPENAI_API_KEY")
        self.anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
        # Add other potential direct keys (Grok, Deepseek) if needed
        # self.grok_api_key = os.environ.get("GROK_API_KEY")
        # self.deepseek_api_key = os.environ.get("DEEPSEEK_API_KEY")

        self.market_news_api_key = os.environ.get("MARKET_NEWS_API_KEY")
        self.competitor_monitor_url = os.environ.get("COMPETITOR_MONITOR_URL")

        # --- General Configuration (Defaults overridden by Env Vars if set) ---
        self._config = {
            # KG parameters
            "kg_storage_type": os.environ.get("KG_STORAGE_TYPE", "neo4j"), # Default to Neo4j
            "neo4j_uri": self.neo4j_uri, # Store loaded secret
            "neo4j_user": self.neo4j_user, # Store loaded secret
            "neo4j_password": self.neo4j_password, # Store loaded secret
            "kg_memory_efficiency": float(os.environ.get("KG_MEMORY_EFFICIENCY", 0.75)), # Less relevant for DB backend
            "entity_resolution_accuracy": float(os.environ.get("ENTITY_RESOLUTION_ACCURACY", 0.995)),
            "context_window": int(os.environ.get("CONTEXT_WINDOW", 128000)), # Base context, Llama 4 can handle more

            # Business impact targets
            "roas_target": float(os.environ.get("ROAS_TARGET", 9.0)),
            "cac_reduction_target": float(os.environ.get("CAC_REDUCTION_TARGET", 0.60)),
            "clv_increase_target": float(os.environ.get("CLV_INCREASE_TARGET", 0.50)),
            "human_intervention_reduction": float(os.environ.get("HUMAN_INTERVENTION_REDUCTION", 0.80)),
            "operational_cost_reduction_target": float(os.environ.get("OPERATIONAL_COST_REDUCTION_TARGET", 0.35)),
            "revenue_growth_target": float(os.environ.get("REVENUE_GROWTH_TARGET", 0.30)),
            "budget_reallocation_efficiency_target": float(os.environ.get("BUDGET_REALLOCATION_EFFICIENCY_TARGET", 0.677)),
            "ad_bidding_efficiency_improvement_target": float(os.environ.get("AD_BIDDING_EFFICIENCY_IMPROVEMENT_TARGET", 0.40)),

            # System architecture parameters
            "max_experts": int(os.environ.get("MAX_EXPERTS", 12)),
            "research_agent_frequency": int(os.environ.get("RESEARCH_AGENT_FREQUENCY", 7)), # Days
            "feedback_threshold": float(os.environ.get("FEEDBACK_THRESHOLD", 0.75)),
            "decision_confidence_threshold": float(os.environ.get("DECISION_CONFIDENCE_THRESHOLD", 0.85)),
            "human_review_confidence_threshold": float(os.environ.get("HUMAN_REVIEW_CONFIDENCE_THRESHOLD", 0.75)),
            "workflow_evolution_check_frequency": int(os.environ.get("WORKFLOW_EVOLUTION_CHECK_FREQUENCY", 20)),
            "workflow_error_rate_threshold": float(os.environ.get("WORKFLOW_ERROR_RATE_THRESHOLD", 0.1)),
            "workflow_step_duration_threshold_ms": int(os.environ.get("WORKFLOW_STEP_DURATION_THRESHOLD_MS", 500)),
            "rtb_min_bid_threshold": float(os.environ.get("RTB_MIN_BID_THRESHOLD", 0.01)),

            # Holistic Objectives (Example structure, could be loaded from JSON file/env var)
            "objectives": [
               {"id": "maximize_profit", "description": "Maximize overall profit", "metrics": ["roas", "cac"], "weight": 0.6, "direction": {"roas": "maximize", "cac": "minimize"}},
               {"id": "enhance_brand", "description": "Enhance brand equity", "metrics": ["brand_sentiment", "brand_awareness"], "weight": 0.2, "direction": {"brand_sentiment": "maximize", "brand_awareness": "maximize"}},
               {"id": "customer_retention", "description": "Improve customer retention", "metrics": ["clv", "churn_rate"], "weight": 0.2, "direction": {"clv": "maximize", "churn_rate": "minimize"}}
            ],
            # Targets & Baselines (Examples, adjust based on actuals or load from file/env var)
            "targets": { "roas": 9.0, "cac": 40.0, "clv": 750.0, "churn_rate": 0.05, "brand_sentiment": 60.0, "brand_awareness": 75.0 },
            "baselines": { "roas": 2.5, "cac": 100.0, "clv": 500.0, "churn_rate": 0.15, "brand_sentiment": 20.0, "brand_awareness": 50.0 },

            "pseudonymization_salt": self.miz_salt, # Store loaded secret

            # Foundation Model Keys (Loaded above)
            "foundation_model_keys": {
                "openai": self.openai_api_key,
                "anthropic": self.anthropic_api_key,
                # "grok": self.grok_api_key,
                # "deepseek": self.deepseek_api_key
            },
            # Foundation Model Defaults (OKI Spec: LLaMA 4 focus)
            "foundation_model_defaults": {
                "vertex": "gemini-1.5-flash-001", # Default Vertex model if Llama 4 not primary
                # OKI: Use actual Vertex AI Llama 3 model IDs as placeholders for Llama 4 roles.
                # Replace these with actual Llama 4 model IDs when available on Vertex AI.
                "llama4_scout": "llama3-8b-instruct", # Placeholder for Llama 4 Scout role
                "llama4_maverick": "llama3-70b-instruct", # Placeholder for Llama 4 Maverick role
                "openai": "gpt-4-turbo",
                "anthropic": "claude-3-5-sonnet-20240620", # Use Sonnet 3.5
            },
            # Foundation Model Pricing (OKI TODO: Update with accurate LLaMA 4 pricing when available)
            "foundation_model_pricing": {
                 "vertex": {
                      "gemini-1.5-flash-001": {"prompt": 0.000125 / 1000, "completion": 0.000375 / 1000}, # Per Char? Assume Token.
                      "gemini-1.0-pro": {"prompt": 0.000125 / 1000, "completion": 0.000375 / 1000},
                      # Llama 3 pricing on Vertex AI (Example - check current pricing)
                      "llama3-8b-instruct": {"prompt": 0.0005 / 1000, "completion": 0.0005 / 1000}, # Example $/1k tokens (Placeholder for Llama 4 Scout)
                      "llama3-70b-instruct": {"prompt": 0.00265 / 1000, "completion": 0.00265 / 1000}, # Example $/1k tokens (Placeholder for Llama 4 Maverick)
                 },
                 "openai": { "gpt-4-turbo": {"prompt": 0.01 / 1000, "completion": 0.03 / 1000}, },
                 "anthropic": { "claude-3-5-sonnet-20240620": {"prompt": 3.0 / 1_000_000, "completion": 15.0 / 1_000_000}, } # $/Million tokens
            },

            # Agent Orchestrator / MoA Config (Production defaults)
            # CRITICAL: These require actual backend setup (Pub/Sub, Postgres/Firestore etc.)
            "task_queue_type": os.environ.get("TASK_QUEUE_TYPE", "pubsub"), # Default to Pub/Sub for production intent
            "task_persistence_type": os.environ.get("TASK_PERSISTENCE_TYPE", "firestore"), # Default to Firestore for production intent
            "task_persistence_filepath": "miz3_orchestrator_state.json", # Only used if type is 'file' (for testing)
            "pubsub_topic": os.environ.get("PUBSUB_TOPIC", "miz3-tasks"),
            "firestore_collection": os.environ.get("FIRESTORE_COLLECTION", "miz3_tasks"),
            "dlq_target": os.environ.get("DLQ_TARGET", "log_only"), # Target for Dead Letter Queue (e.g., pubsub topic, db table, log_only)

            # MoA / B.O.S.S. specific config
            "boss_sub_agent_count": int(os.environ.get("BOSS_SUB_AGENT_COUNT", 5)),
            "mini_model_retraining_trigger_metric": os.environ.get("MINI_MODEL_RETRAINING_TRIGGER_METRIC", "accuracy_drop"),
            "mini_model_retraining_threshold": float(os.environ.get("MINI_MODEL_RETRAINING_THRESHOLD", 0.05)),

            # MLOps Config
            "gcs_bucket_name": os.environ.get("GCS_BUCKET_NAME", f"{self.project_id}-miz3-data"),
            "mlops_pipeline_root": f"gs://{os.environ.get('GCS_BUCKET_NAME', f'{self.project_id}-miz3-data')}/miz3_pipelines",
            "mlops_serving_image": os.environ.get("MLOPS_SERVING_IMAGE", "us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2.9:latest"), # Example, adjust TF version

            # External Data Sources (Loaded above)
            "external_data_sources": {
                "market_news_api": self.market_news_api_key,
                "competitor_monitor_url": self.competitor_monitor_url
            },

            # MoA Specific Config (Example structure, load from JSON/Env Var if complex)
            "moa_layer_configs": {
                # Layer IDs map to agent types and the Llama 4 model alias they should primarily use
                "0": {"agents": ["BossAgent"], "model": "llama4_maverick"}, # Boss uses powerful Maverick
                "1": {"agents": ["DataProcessingAgent"], "model": "llama4_scout"}, # Data processing uses efficient Scout
                "2": {"agents": ["AnalysisAgent"], "model": "llama4_maverick"}, # Analysis uses Maverick
                "3": {"agents": ["KnowledgeGraphAgent"], "model": "llama4_scout"}, # KG updates use Scout
                "4": {"agents": ["ActionAgent"], "model": "llama4_scout"} # Action execution uses Scout
            },
            "worker_configs": { # Example structure for potential MoA workers (if used)
                "DataWorker": {"cache_size": 500, "timeout": 600},
                "AnalysisWorker": {"cache_size": 200, "timeout": 1200}
            },
            "quality_threshold": float(os.environ.get("QUALITY_THRESHOLD", 0.75)), # For MoA solver/validation
            "evidence_store_config": {"type": "memory"}, # Placeholder for MoA evidence store config

        }
        # Update derived values
        self._config["mlops_pipeline_root"] = f"gs://{self.get('gcs_bucket_name')}/miz3_pipelines"
        self.logger.info("Configuration loaded.")

    def get(self, key: str, default: Any = None) -> Any:
        """Get a configuration value."""
        return self._config.get(key, default)

    def get_dict(self) -> Dict[str, Any]:
        """Get the entire configuration dictionary."""
        # Return a copy to prevent modification
        return self._config.copy()

    # --- Add methods for specific config sections if needed ---
    def get_model_config(self, model_alias: str) -> Optional[Dict[str, Any]]:
        """Gets the actual model ID and pricing for a given alias (e.g., 'llama4_maverick')."""
        defaults = self.get("foundation_model_defaults", {})
        pricing = self.get("foundation_model_pricing", {})
        model_id = defaults.get(model_alias)
        if not model_id:
            self.logger.warning(f"Model alias '{model_alias}' not found in defaults.")
            return None

        # Find provider based on model ID (simple heuristic)
        provider = None
        # OKI: Update heuristic for Llama 3/4 on Vertex
        if "gemini" in model_id or "llama3" in model_id: provider = "vertex"
        elif "gpt" in model_id: provider = "openai"
        elif "claude" in model_id: provider = "anthropic"
        # Add more heuristics if needed

        if not provider:
             self.logger.warning(f"Could not determine provider for model ID '{model_id}'.")
             return {"id": model_id, "provider": None, "pricing": None}

        model_pricing = pricing.get(provider, {}).get(model_id)
        if not model_pricing:
             self.logger.warning(f"Pricing not found for model ID '{model_id}' under provider '{provider}'.")

        return {"id": model_id, "provider": provider, "pricing": model_pricing}

    def get_layer_config(self, layer_id: Union[int, str]) -> Optional[Dict[str, Any]]:
        """Gets the configuration for a specific MoA layer."""
        return self.get("moa_layer_configs", {}).get(str(layer_id))

    def get_worker_config(self, worker_name: str) -> Optional[Dict[str, Any]]:
         """Gets the configuration for a specific MoA worker."""
         return self.get("worker_configs", {}).get(worker_name)

# --- Global Config Instance ---
config = None
try:
    config = EnhancedConfig()
    PROJECT_ID = config.project_id
    REGION = config.region
    BUCKET_NAME = config.get("gcs_bucket_name")
    # Expose config globally for other cells (ensure this is intended)
    CONFIG = config.get_dict()
except ValueError as config_ve:
     # Catch specific ValueErrors raised for critical missing secrets
     logging.error(f"CRITICAL CONFIGURATION ERROR: {config_ve}. Halting execution.")
     # Set critical vars to None or defaults to prevent downstream errors, but system is unusable
     PROJECT_ID = "spry-bus-425315-p6" # Fallback default
     REGION = "us-central1" # Fallback default
     BUCKET_NAME = f"{PROJECT_ID}-miz3-data" # Fallback default
     CONFIG = {} # Empty config
     # Re-raise or exit here in a real application
     # raise config_ve
except Exception as config_e:
     logging.error(f"CRITICAL: Failed to initialize EnhancedConfig: {config_e}", exc_info=True)
     # Attempt to set critical vars to defaults to allow partial continuation, but log error
     PROJECT_ID = "spry-bus-425315-p6"
     REGION = "us-central1"
     BUCKET_NAME = f"{PROJECT_ID}-miz3-data"
     CONFIG = {}
     logging.error("Falling back to default PROJECT_ID/REGION/BUCKET_NAME and empty CONFIG.")


# --- Component Initialization (Using EnhancedConfig) ---

# Set up logging (ensure it's configured early)
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('MIZ-OKI') # Global logger

# Initialize Vertex AI SDK
vertex_ai_initialized = False
if config and PROJECT_ID and REGION and BUCKET_NAME: # Check if config loaded successfully
    try:
        # Check if already initialized (more robust check)
        if not aiplatform.constants.global_config.initialized:
            aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=f"gs://{BUCKET_NAME}/vertex_staging")
            logger.info(f"Vertex AI SDK initialized for project {PROJECT_ID} in {REGION}.")
        else:
            logger.info("Vertex AI SDK already initialized.")
        vertex_ai_initialized = True
    except Exception as ai_init_e:
        logger.error(f"Failed to initialize Vertex AI SDK: {ai_init_e}", exc_info=True)
else:
     logger.error("Config not loaded or critical values missing, skipping Vertex AI initialization.")

# Set up GCS bucket for storage
storage_client = None
bucket = None
if config and PROJECT_ID and BUCKET_NAME: # Check if config loaded successfully
    try:
        storage_client = storage.Client(project=PROJECT_ID)
        logger.info("Google Cloud Storage client initialized.")
        try:
            bucket = storage_client.get_bucket(BUCKET_NAME)
            logger.info(f"Bucket {BUCKET_NAME} already exists")
        except gcp_exceptions.NotFound:
            logger.info(f"Bucket {BUCKET_NAME} not found, attempting creation.")
            try:
                bucket = storage_client.create_bucket(BUCKET_NAME, location=REGION)
                logger.info(f"Bucket {BUCKET_NAME} created")
            except gcp_exceptions.Conflict:
                 logger.warning(f"Bucket {BUCKET_NAME} likely created by another process. Attempting to get handle.")
                 time.sleep(1)
                 bucket = storage_client.get_bucket(BUCKET_NAME)
            except Exception as create_e:
                logger.error(f"Failed to create bucket {BUCKET_NAME}: {create_e}", exc_info=True)
                bucket = None
        except Exception as e:
            logger.error(f"Failed to get bucket {BUCKET_NAME}: {e}", exc_info=True)
            bucket = None
    except Exception as storage_init_e:
         logger.error(f"Failed to initialize Google Cloud Storage client: {storage_init_e}", exc_info=True)
else:
     logger.error("Config not loaded or critical values missing, skipping GCS initialization.")


# Initialize Neo4j Driver (Placeholder - actual connection managed in Cell 3)
neo4j_driver = None # Driver instance is managed by the adapter in Cell 3
neo4j_connection_verified = False
if config and config.get("kg_storage_type") == "neo4j":
    if NEO4J_AVAILABLE:
        try:
            # Test connection briefly
            test_driver = GraphDatabase.driver(config.get("neo4j_uri"), auth=basic_auth(config.get("neo4j_user"), config.get("neo4j_password")))
            test_driver.verify_connectivity()
            logger.info(f"Successfully verified connectivity to Neo4j at {config.get('neo4j_uri')}")
            test_driver.close()
            neo4j_connection_verified = True
        except Exception as neo4j_e:
            logger.error(f"CRITICAL: Failed to connect to Neo4j at {config.get('neo4j_uri')}: {neo4j_e}")
            logger.error("Ensure Neo4j is running, accessible, and credentials are correct.")
    else:
         logger.error("CRITICAL: Neo4j configured but 'neo4j' library not installed. KG functionality WILL FAIL.")
elif config:
     logger.info(f"KG storage type is '{config.get('kg_storage_type')}'. Neo4j adapter will not be used.")
else:
     logger.error("Config not loaded, skipping Neo4j check.")

# --- Final Status Check ---
logger.info("MIZ 3.0 OKI BGI Platform - Environment configuration complete.")
print("--- MIZ 3.0 OKI Environment Status ---")
if config is None or not CONFIG: # Check if config object or global dict is missing
     print("❌ CRITICAL: Configuration loading failed. System will not function.")
else:
     print("✅ EnhancedConfig Initialized.")
     if not vertex_ai_initialized: print("❌ WARNING: Vertex AI SDK initialization failed. Vertex/LLM features may fail.")
     else: print("✅ Vertex AI SDK Initialized.")
     if bucket is None: print("❌ WARNING: GCS Bucket setup failed. GCS operations WILL FAIL.")
     else: print(f"✅ GCS Bucket '{BUCKET_NAME}' OK.")

     if config.get("kg_storage_type") == "neo4j":
         if NEO4J_AVAILABLE and not neo4j_connection_verified: # Check if connection test failed
              print(f"⚠️ WARNING: Neo4j configured but initial connectivity test FAILED (Check logs). KG functionality may fail.")
         elif not NEO4J_AVAILABLE:
              print(f"❌ ERROR: Neo4j configured but 'neo4j' library not installed. KG functionality WILL FAIL.")
         else:
              print("✅ Neo4j Configured (Connectivity Verified).")
     else:
         print(f"ℹ️ KG Storage Type: '{config.get('kg_storage_type')}'.")

     # Check salt again after potential error handling
     if not config.get("pseudonymization_salt") or config.get("pseudonymization_salt") == "default_insecure_salt_replace_me_!!":
         print("❌ CRITICAL WARNING: Using default or missing pseudonymization salt. SET MIZ_SALT environment variable securely!")
     else:
         print("✅ Pseudonymization Salt Configured.")

     queue_type = config.get('task_queue_type')
     persist_type = config.get('task_persistence_type')
     if queue_type == "memory" or persist_type == "file":
         print(f"⚠️ WARNING: Agent Orchestrator using '{queue_type}' queue and '{persist_type}' persistence. NOT SUITABLE FOR PRODUCTION.")
     else:
         print(f"✅ Agent Orchestrator configured with production-intended queue ('{queue_type}') / persistence ('{persist_type}'). Ensure backends are set up.")
print("------------------------------------")

# Instantiate MoA components if defined globally here (less ideal)
# Example: communication_system = UnifiedCommunicationSystem()
# Example: moa_system = EnhancedMoASystem(config)






CRITICAL:MIZ-OKI.Config:MIZ_SALT environment variable is missing or set to the default insecure value. SET THIS SECURELY.
ERROR:root:CRITICAL CONFIGURATION ERROR: Missing or insecure MIZ_SALT detected. Set MIZ_SALT environment variable securely.. Halting execution.
ERROR:MIZ-OKI:Config not loaded or critical values missing, skipping Vertex AI initialization.
ERROR:MIZ-OKI:Config not loaded or critical values missing, skipping GCS initialization.
ERROR:MIZ-OKI:Config not loaded, skipping Neo4j check.


--- MIZ 3.0 OKI Environment Status ---
❌ CRITICAL: Configuration loading failed. System will not function.
------------------------------------


In [12]:
# Cell 2: Data Extraction and Knowledge Graph Preparation (Enhanced for OKI - Refined & Improved)
# Status: LLaMA 4 semantic processing call refined. Basic multimodal metadata added. Streaming placeholders added. MoA refactor noted. Error handling improved.
# MIZ 3.0 OKI: Aligned with Layer 1 (KG). Implements LLaMA 4 semantic processing via FM Client. Basic multimodal/streaming placeholders.
# FUTURE WORK: Refactor into MoA DataProcessingAgent, replace ThreadPoolExecutor with async processing. Implement full streaming/multimodal analysis.

from google.cloud import storage
from google.cloud import bigquery
from google.cloud import exceptions as gcp_exceptions
import pandas as pd
import numpy as np
import json
import datetime
import logging
import io
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed # OKI TODO: Replace with async in MoA Agent
import re
import uuid # Added for BQ column cleaning

# Use the logger defined in Cell 1
logger = logging.getLogger('MIZ-OKI.DataIngestion')

# Assume FoundationModelClient class is defined in Cell 18 and instance is available
# foundation_model_client = foundation_model_client if 'foundation_model_client' in locals() else None

# OKI TODO: Refactor this entire class into a MoA DataProcessingAgent (Layer 1)
# This agent would receive tasks via the orchestrator queue and perform these operations.
class DataIngestionPipeline:
    """
    Manages data extraction, transformation, and preparation for the E-SHKG.
    (MIZ 3.0 OKI Layer 1 - KU Pillar Foundation)
    OKI Enhancement: Integrates LLaMA 4 semantic processing via FoundationModelClient.
    Future Work: Refactor into MoA DataProcessingAgent, replace ThreadPoolExecutor, implement full streaming/multimodal.
    """
    def __init__(self, config: EnhancedConfig, project_id: str, bucket_name: str, foundation_model_client: Optional[Any] = None):
        self.config = config # Use EnhancedConfig object
        self.project_id = project_id
        self.bucket_name = bucket_name
        self.bq_dataset = config.get("bq_dataset", "miz3_data")
        self.data_sources = {}
        self.last_update = {}
        self.logger = logging.getLogger('MIZ-OKI.DataIngestion')
        self.fm_client = foundation_model_client # Store FM client instance
        if not self.fm_client:
             self.logger.warning("FoundationModelClient not provided. Semantic/Multimodal processing will be disabled.")

        # Initialize clients
        try:
             self.storage_client = storage.Client(project=self.project_id)
             self.bigquery_client = bigquery.Client(project=self.project_id)
        except Exception as client_e:
             self.logger.error(f"Failed to initialize GCS/BQ clients: {client_e}", exc_info=True)
             self.storage_client = None
             self.bigquery_client = None
             self.data_sources["gcs"] = False
             self.data_sources["bigquery"] = False
             self.bucket = None
             return

        # Connect to GCS Bucket
        if self.storage_client:
            try:
                self.bucket = self.storage_client.get_bucket(self.bucket_name)
                self.logger.info(f"Successfully connected to GCS bucket: {self.bucket_name}")
                self.data_sources["gcs"] = True
                self.last_update["gcs"] = datetime.datetime.now()
            except gcp_exceptions.NotFound:
                 self.logger.error(f"GCS Bucket {self.bucket_name} not found.")
                 self.data_sources["gcs"] = False
                 self.bucket = None
            except Exception as e:
                self.logger.error(f"Failed to connect to GCS bucket {self.bucket_name}: {e}", exc_info=True)
                self.data_sources["gcs"] = False
                self.bucket = None
        else:
             self.data_sources["gcs"] = False
             self.bucket = None

        # Ensure BQ dataset exists
        if self.bigquery_client:
             self.create_bigquery_dataset()
        else:
             self.data_sources["bigquery"] = False

    def list_available_data(self, prefix=None):
        """List available data in the GCS bucket."""
        if not self.data_sources.get("gcs", False) or not self.bucket:
            self.logger.warning("Not connected to GCS or bucket not available")
            return []
        try:
            blobs = list(self.bucket.list_blobs(prefix=prefix))
            return [blob.name for blob in blobs]
        except Exception as e:
            self.logger.error(f"Error listing GCS data in {self.bucket_name}: {e}", exc_info=True)
            return []

    def read_gcs_file(self, file_path, file_format=None):
        """
        Read a file from GCS bucket.
        OKI Enhancement: Added basic multimodal metadata extraction.
        """
        if not self.data_sources.get("gcs", False) or not self.bucket:
            self.logger.warning("Not connected to GCS or bucket not available")
            return None

        try:
            blob = self.bucket.blob(file_path)
            if not blob.exists():
                 self.logger.error(f"File not found in GCS: {self.bucket_name}/{file_path}")
                 return None

            # Determine file format
            if file_format is None:
                _, ext = os.path.splitext(file_path.lower())
                if ext == '.csv': file_format = 'csv'
                elif ext == '.json': file_format = 'json'
                elif ext == '.parquet': file_format = 'parquet'
                elif ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']: file_format = 'image_metadata' # OKI: Handle images
                elif ext in ['.mp4', '.avi', '.mov', '.wmv']: file_format = 'video_metadata' # OKI: Handle videos
                elif ext in ['.txt', '.log'] or not ext: file_format = 'text'
                else:
                    self.logger.error(f"Unsupported file extension '{ext}' for file {file_path}.")
                    return None

            self.logger.info(f"Reading GCS file: {file_path} as {file_format}")

            # Read based on format
            if file_format == 'csv':
                content_bytes = blob.download_as_bytes()
                try:
                    content = content_bytes.decode('utf-8')
                    return pd.read_csv(io.StringIO(content), on_bad_lines='warn')
                except UnicodeDecodeError:
                    self.logger.warning(f"UTF-8 decoding failed for {file_path}. Trying latin-1.")
                    content = content_bytes.decode('latin-1', errors='replace')
                    return pd.read_csv(io.StringIO(content), on_bad_lines='warn')
                except pd.errors.ParserError as e:
                    self.logger.warning(f"CSV parsing failed for {file_path}: {e}. Check delimiter, quoting, or bad lines.")
                    return None
                except Exception as csv_e:
                    self.logger.error(f"Failed to read CSV {file_path}: {csv_e}", exc_info=True)
                    return None
            elif file_format == 'json':
                content_bytes = blob.download_as_bytes()
                try: content = content_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    self.logger.warning(f"UTF-8 decoding failed for JSON {file_path}. Trying latin-1.")
                    content = content_bytes.decode('latin-1', errors='replace')
                if not content.strip():
                    self.logger.warning(f"JSON file is empty: {file_path}")
                    return pd.DataFrame()
                try:
                    # Handle both JSON array and JSON lines (ndjson)
                    if content.strip().startswith('['):
                        return pd.read_json(io.StringIO(content), orient='records')
                    else:
                        return pd.read_json(io.StringIO(content), lines=True)
                except ValueError as json_e:
                    self.logger.error(f"Error parsing JSON file {file_path}: {json_e}. Check format (array vs ndjson).")
                    return None
                except Exception as json_gen_e:
                     self.logger.error(f"Failed to read JSON {file_path}: {json_gen_e}", exc_info=True)
                     return None
            elif file_format == 'parquet':
                try:
                    with io.BytesIO() as buffer:
                        blob.download_to_file(buffer)
                        buffer.seek(0)
                        return pd.read_parquet(buffer)
                except Exception as parquet_e:
                    self.logger.error(f"Failed to read Parquet file {file_path}: {parquet_e}", exc_info=True)
                    return None
            elif file_format == 'text':
                 try:
                      content_string = blob.download_as_text()
                      # Return DataFrame with text content and source
                      return pd.DataFrame([{"text_content": content_string, "source_file": file_path}])
                 except Exception as text_e:
                      self.logger.error(f"Could not read {file_path} as text: {text_e}")
                      return None

            # OKI: Basic multimodal metadata extraction
            elif file_format == 'image_metadata':
                self.logger.info(f"Extracting basic metadata for image: {file_path}")
                metadata = {
                    "source_file": file_path,
                    "gcs_uri": f"gs://{self.bucket_name}/{file_path}",
                    "gcs_size_bytes": blob.size,
                    "gcs_content_type": blob.content_type,
                    "modality": "image",
                    "extracted_at": datetime.datetime.now().isoformat()
                }
                # OKI TODO (Phase 2/3): Call LLaMA 4 multimodal via fm_client (Cell 18) to generate description/tags
                # if self.fm_client and hasattr(self.fm_client, 'describe_image'):
                #     try:
                #         # Assuming async method in fm_client
                #         # description = await self.fm_client.describe_image(image_uri=metadata["gcs_uri"])
                #         description = "Placeholder AI description for image" # Placeholder
                #         metadata["description_ai"] = description
                #     except Exception as img_ai_e:
                #         self.logger.warning(f"LLaMA 4 image description failed for {file_path}: {img_ai_e}")
                return pd.DataFrame([metadata])
            elif file_format == 'video_metadata':
                self.logger.info(f"Extracting basic metadata for video: {file_path}")
                metadata = {
                    "source_file": file_path,
                    "gcs_uri": f"gs://{self.bucket_name}/{file_path}",
                    "gcs_size_bytes": blob.size,
                    "gcs_content_type": blob.content_type,
                    "modality": "video",
                    "extracted_at": datetime.datetime.now().isoformat()
                }
                # OKI TODO (Phase 2/3): Call LLaMA 4 multimodal via fm_client (Cell 18) for video analysis
                return pd.DataFrame([metadata])

            else:
                # This should not be reached if logic is correct
                self.logger.error(f"Logic error: Reached unsupported file format '{file_format}' in read_gcs_file.")
                return None

        except gcp_exceptions.GoogleCloudError as gcs_e:
             self.logger.error(f"GCS error reading file {file_path}: {gcs_e}", exc_info=True)
             return None
        except Exception as e:
            self.logger.error(f"Unexpected error reading file {file_path} from GCS: {e}", exc_info=True)
            return None

    def create_bigquery_dataset(self, dataset_id=None):
        """Creates the BigQuery dataset if it doesn't exist."""
        if not self.bigquery_client:
             self.logger.error("BigQuery client not initialized. Cannot create dataset.")
             self.data_sources["bigquery"] = False
             return False
        if dataset_id is None: dataset_id = self.bq_dataset
        dataset_ref = f"{self.project_id}.{dataset_id}"
        try:
            self.bigquery_client.get_dataset(dataset_ref)
            self.logger.info(f"Dataset {dataset_ref} already exists")
        except gcp_exceptions.NotFound:
            try:
                dataset = bigquery.Dataset(dataset_ref)
                # Use location from config, default to US
                dataset.location = self.config.get("bq_dataset_location", "US")
                self.logger.info(f"Creating dataset {dataset_ref} in location {dataset.location}...")
                self.bigquery_client.create_dataset(dataset, timeout=30)
                self.logger.info(f"Created dataset {dataset_ref}")
            except Exception as create_e:
                 self.logger.error(f"Failed to create dataset {dataset_ref}: {create_e}", exc_info=True)
                 self.data_sources["bigquery"] = False
                 return False
        except Exception as get_e:
             self.logger.error(f"Failed to check for dataset {dataset_ref}: {get_e}", exc_info=True)
             self.data_sources["bigquery"] = False
             return False
        self.data_sources["bigquery"] = True
        self.last_update["bigquery"] = datetime.datetime.now()
        return True

    def load_to_bigquery(self, dataframe, table_id, write_disposition="WRITE_TRUNCATE"):
        """Loads a Pandas DataFrame into a BigQuery table."""
        if not isinstance(dataframe, pd.DataFrame) or dataframe.empty:
            self.logger.warning(f"Empty or invalid DataFrame provided for BQ table {table_id}. Skipping load.")
            return False
        if not self.data_sources.get("bigquery", False) or not self.bigquery_client:
            self.logger.error(f"BigQuery not available or client not initialized. Cannot load to table {table_id}.")
            return False
        full_table_id = f"{self.project_id}.{self.bq_dataset}.{table_id}"
        original_columns = dataframe.columns.tolist()
        try:
            # Clean column names for BQ compatibility
            cleaned_columns, column_map = self._clean_bq_column_names(original_columns)
            dataframe.columns = cleaned_columns
            self.logger.debug(f"Renamed columns for BQ: {column_map}")

            job_config = bigquery.LoadJobConfig(write_disposition=write_disposition, autodetect=True)
            self.logger.info(f"Loading {len(dataframe)} rows to BigQuery table {full_table_id} (Disposition: {write_disposition})...")
            job = self.bigquery_client.load_table_from_dataframe(dataframe, full_table_id, job_config=job_config)
            job.result() # Wait for the job to complete
            table = self.bigquery_client.get_table(full_table_id)
            self.logger.info(f"Successfully loaded {table.num_rows} rows to {full_table_id}")
            return True
        except Exception as e:
            self.logger.error(f"Error loading data to BigQuery table {full_table_id}: {e}", exc_info=True)
            # Restore original columns if renaming happened
            if list(dataframe.columns) != original_columns: dataframe.columns = original_columns
            return False
        finally:
             # Ensure original columns are restored even if BQ load succeeds but subsequent code fails
             if list(dataframe.columns) != original_columns: dataframe.columns = original_columns

    def _clean_bq_column_names(self, columns):
        """Cleans column names for BigQuery compatibility."""
        cleaned_columns = []
        column_map = {}
        seen_names = set()
        for col in columns:
            name = str(col)
            # Replace non-alphanumeric or leading digit with underscore
            name = re.sub(r'\W|^(?=\d)', '_', name)
            # Ensure name doesn't start or end with underscore (common issue)
            name = name.strip('_')
            # Ensure name is not empty after cleaning
            if not name: name = f"col_{uuid.uuid4().hex[:8]}"
            # Truncate long names (BQ limit is 128)
            name = name[:128]
            original_name = name
            count = 1
            # Ensure uniqueness (case-insensitive check)
            while name.lower() in seen_names:
                name = f"{original_name}_{count}"[:128]
                count += 1
            cleaned_columns.append(name)
            seen_names.add(name.lower())
            column_map[col] = name
        return cleaned_columns, column_map

    def extract_and_transform_for_kg(self, file_paths, data_type=None):
        """
        Extract data and transform for KG structure.
        OKI Enhancement: Calls FoundationModelClient for semantic processing.
        OKI TODO: Replace ThreadPoolExecutor with async processing in MoA Agent.
        """
        if not isinstance(file_paths, list): file_paths = [file_paths]

        # Auto-detect data type
        if data_type is None and file_paths:
            first_path_lower = file_paths[0].lower()
            if 'facebook' in first_path_lower or 'meta' in first_path_lower: data_type = 'facebook_ads' # Updated
            elif 'googleads' in first_path_lower or 'adwords' in first_path_lower: data_type = 'google_ads'
            elif 'ga4' in first_path_lower or 'analytics' in first_path_lower: data_type = 'ga4'
            elif 'shopify' in first_path_lower: data_type = 'shopify'
            elif 'klaviyo' in first_path_lower: data_type = 'klaviyo'
            elif 'support' in first_path_lower or 'transcript' in first_path_lower: data_type = 'support_transcript'
            elif 'returns' in first_path_lower: data_type = 'returns_data'
            elif first_path_lower.endswith(('.txt', '.log')): data_type = 'text_log'
            elif first_path_lower.endswith(('.jpg', '.jpeg', '.png')): data_type = 'image'
            elif first_path_lower.endswith(('.mp4', '.avi', '.mov')): data_type = 'video'
            else: data_type = 'generic'
            self.logger.info(f"Auto-detected data type as: {data_type}")

        # Read files in parallel (using ThreadPoolExecutor for now)
        all_data = []
        # OKI TODO: Replace ThreadPoolExecutor with async processing in MoA Agent
        self.logger.warning("Using ThreadPoolExecutor for parallel file reading. Replace with async in MoA Agent.")
        with ThreadPoolExecutor(max_workers=10) as executor:
            future_to_path = {executor.submit(self.read_gcs_file, path): path for path in file_paths}
            for future in as_completed(future_to_path):
                path = future_to_path[future]
                try:
                    data = future.result()
                    if data is not None: all_data.append(data)
                    else: self.logger.warning(f"No data returned from reading file: {path}")
                except Exception as exc:
                    self.logger.error(f"Error reading file {path}: {exc}", exc_info=False)

        if not all_data:
            self.logger.warning("No data successfully read from provided file paths.")
            return {"entities": [], "relationships": []}

        # OKI: Semantic/Multimodal processing for relevant types
        unstructured_types = ['text_log', 'support_transcript', 'external_news', 'returns_data'] # Text-based
        multimodal_types = ['image', 'video'] # Multimodal (metadata extracted, AI analysis needed)

        if data_type in unstructured_types or data_type in multimodal_types:
             self.logger.info(f"Applying semantic/multimodal processing for data type: {data_type}")
             if self.fm_client and hasattr(self.fm_client, 'extract_kg_data_from_content'):
                  # Process each item individually or batch if fm_client supports it
                  all_entities = []
                  all_relationships = []
                  # Use Llama 4 Maverick alias from config for complex semantic tasks
                  # IMPROVEMENT: Get model alias from config, fallback if missing
                  model_alias = self.config.get("foundation_model_defaults", {}).get('llama4_maverick', 'llama3-70b-instruct') # Fallback to Llama3 70b

                  for item_data in all_data:
                       # Prepare content based on type
                       content_to_process = None
                       if data_type in unstructured_types and isinstance(item_data, pd.DataFrame) and 'text_content' in item_data.columns:
                            # Assuming one doc per file for now, concatenate if multiple rows
                            content_to_process = "\n---\n".join(item_data['text_content'].astype(str))
                       elif data_type in multimodal_types and isinstance(item_data, pd.DataFrame) and 'gcs_uri' in item_data.columns:
                            # IMPROVEMENT: Pass URI dict for multimodal analysis
                            content_to_process = {"uri": item_data['gcs_uri'].iloc[0], "modality": data_type}
                       # Add more complex content preparation if needed

                       if content_to_process:
                            try:
                                 # OKI TODO: Make fm_client call async if possible when refactoring to agent
                                 # IMPROVEMENT: Pass explicit model alias
                                 results = self.fm_client.extract_kg_data_from_content(
                                     content=content_to_process,
                                     data_type=data_type,
                                     model_alias=model_alias # Pass alias, client resolves to ID
                                 )
                                 # IMPROVEMENT: Add expected output comment
                                 # Expected results format: {"entities": [ {..., "_resolution_hints": {...}} ], "relationships": [ {..., "source_hints": {...}, "target_hints": {...}} ]}
                                 if results and isinstance(results, dict):
                                      # Basic validation of results
                                      entities = results.get("entities", [])
                                      relationships = results.get("relationships", [])
                                      if isinstance(entities, list) and isinstance(relationships, list):
                                           # IMPROVEMENT: Add validation for hints within LLM results
                                           valid_entities = [e for e in entities if isinstance(e, dict) and e.get('_resolution_hints')]
                                           valid_relationships = [r for r in relationships if isinstance(r, dict) and r.get('source_hints') and r.get('target_hints')]
                                           if len(valid_entities) < len(entities): self.logger.warning(f"LLM results contained {len(entities) - len(valid_entities)} entities missing resolution hints.")
                                           if len(valid_relationships) < len(relationships): self.logger.warning(f"LLM results contained {len(relationships) - len(valid_relationships)} relationships missing source/target hints.")
                                           all_entities.extend(valid_entities)
                                           all_relationships.extend(valid_relationships)
                                      else:
                                           self.logger.warning(f"Invalid entity/relationship list format in LLaMA 4 results for {data_type}.")
                                 else:
                                      self.logger.warning(f"Invalid result format from fm_client for {data_type}.")
                            except Exception as fm_e:
                                 self.logger.error(f"LLaMA 4 processing failed for {data_type}: {fm_e}", exc_info=False)
                       else:
                            self.logger.warning(f"Could not prepare content for LLaMA 4 processing from item: {item_data.head(1) if isinstance(item_data, pd.DataFrame) else type(item_data)}")

                  self.logger.info(f"LLaMA 4 processing yielded {len(all_entities)} entities, {len(all_relationships)} relationships.")
                  # OKI TODO: Add more robust validation step for LLM-extracted entities/relationships
                  return {"entities": all_entities, "relationships": all_relationships}
             else:
                  self.logger.error(f"FoundationModelClient not available or method missing for semantic processing of {data_type}. Returning empty.")
                  return {"entities": [], "relationships": []}

        # Fallback to rule-based transformation for structured types
        else:
            dataframes = [d for d in all_data if isinstance(d, pd.DataFrame)]
            combined_df = None
            if dataframes:
                 try:
                      combined_df = pd.concat(dataframes, ignore_index=True, sort=False)
                      self.logger.info(f"Combined {len(dataframes)} dataframes. Total rows: {len(combined_df)}")
                 except Exception as concat_e:
                      self.logger.error(f"Error concatenating dataframes: {concat_e}. Processing first dataframe only if applicable.")
                      combined_df = dataframes[0] if dataframes else None

            if combined_df is not None:
                self.logger.info(f"Applying rule-based transformation for data type: {data_type}")
                return self._transform_for_knowledge_graph(combined_df, data_type)
            else:
                 self.logger.warning(f"No DataFrame available for rule-based transformation (Type: {data_type}).")
                 return {"entities": [], "relationships": []}


    def _transform_for_knowledge_graph(self, df, data_type):
        """Dispatcher for rule-based transformations. Focuses on extracting attributes and hints for KG layer."""
        processed_ids_placeholder = set() # Placeholder, actual resolution happens in KG layer
        entities = []
        relationships = []
        transform_func = {
            'facebook_ads': self._transform_facebook_ads,
            'google_ads': self._transform_google_ads,
            'ga4': self._transform_ga4,
            'shopify': self._transform_shopify,
            'klaviyo': self._transform_klaviyo,
            'generic': lambda d, p: self._transform_generic(d, data_type, p), # Pass data_type to generic
        }.get(data_type, lambda d, p: self._transform_generic(d, data_type, p))
        try:
            entities, relationships = transform_func(df, processed_ids_placeholder)
            self.logger.info(f"Transformation for {data_type} produced {len(entities)} potential entities, {len(relationships)} potential relationships.")
        except Exception as e:
            self.logger.error(f"Error during transformation for {data_type}: {e}", exc_info=True)
            return {"entities": [], "relationships": []}
        # Basic validation before returning
        final_entities = [e for e in entities if isinstance(e, dict) and e.get('type') and e.get('_resolution_hints')]
        final_relationships = [r for r in relationships if isinstance(r, dict) and r.get('source_hints') and r.get('target_hints') and r.get('type')]
        if len(final_entities) < len(entities): self.logger.warning(f"Filtered out {len(entities) - len(final_entities)} entities with missing type or hints.")
        if len(final_relationships) < len(relationships): self.logger.warning(f"Filtered out {len(relationships) - len(final_relationships)} relationships with missing source/target hints or type.")
        return {"entities": final_entities, "relationships": final_relationships}

    # --- Transformation Helper Functions ---
    def _add_entity(self, entities, entity_dict, processed_ids_placeholder):
        # Ensure type and hints are present
        if not entity_dict.get('type') or not entity_dict.get('_resolution_hints'):
             self.logger.warning(f"Skipping entity due to missing type or resolution hints: {str(entity_dict)[:100]}...")
             return False
        cleaned_entity = {}
        for k, v in entity_dict.items():
            if pd.isna(v): continue
            # Convert numpy types to standard Python types
            if isinstance(v, (np.integer, np.int64)): v = int(v)
            elif isinstance(v, (np.floating, np.float64)): v = float(v)
            elif isinstance(v, np.bool_): v = bool(v)
            elif isinstance(v, (datetime.datetime, datetime.date, pd.Timestamp)): v = v.isoformat()
            # Ensure complex types are JSON serializable (basic check)
            if isinstance(v, (list, dict)) and k not in ['_resolution_hints', '_link_to_entity_hints']: # Allow hints/links
                 try: json.dumps({k: v}) # Test serialization
                 except TypeError:
                      self.logger.warning(f"Skipping non-serializable attribute '{k}' for entity type {entity_dict.get('type')}")
                      continue
            cleaned_entity[k] = v
        entities.append(cleaned_entity)
        return True

    def _add_relationship(self, relationships, rel_dict, processed_ids_placeholder):
        source_hints = rel_dict.get('source_hints')
        target_hints = rel_dict.get('target_hints')
        rel_type = rel_dict.get('type')
        if not source_hints or not target_hints or not rel_type:
             self.logger.warning(f"Skipping relationship due to missing hints or type: {str(rel_dict)[:100]}...")
             return False
        cleaned_rel = {}
        for k, v in rel_dict.items():
             if pd.isna(v): continue
             if isinstance(v, (np.integer, np.int64)): v = int(v)
             elif isinstance(v, (np.floating, np.float64)): v = float(v)
             elif isinstance(v, np.bool_): v = bool(v)
             elif isinstance(v, (datetime.datetime, datetime.date, pd.Timestamp)): v = v.isoformat()
             if isinstance(v, (list, dict)) and k not in ['source_hints', 'target_hints']: # Allow hints
                 try: json.dumps({k: v})
                 except TypeError:
                      self.logger.warning(f"Skipping non-serializable attribute '{k}' for relationship type {rel_type}")
                      continue
             cleaned_rel[k] = v
        relationships.append(cleaned_rel)
        return True

    def _safe_get(self, row, key, default=None):
        """Safely get value from Pandas Series or dict, handling missing keys and NaN."""
        if key is None: return default
        try:
            # Use .get for Series, direct access for dicts
            val = row.get(key, default) if isinstance(row, pd.Series) else row.get(key, default)
            # Check for Pandas/Numpy NaN specifically
            return default if pd.isna(val) else val
        except Exception:
            return default

    def _parse_date(self, date_str, default=None):
        """Parse date string into ISO format, handling errors."""
        if not date_str or pd.isna(date_str): return default
        try:
            # Use infer_datetime_format=True for flexibility, but coerce errors
            dt_obj = pd.to_datetime(date_str, errors='coerce', infer_datetime_format=True)
            if pd.isna(dt_obj):
                 # If parsing fails, return original string if no default, else default
                 self.logger.debug(f"Could not parse date: '{date_str}'. Returning original or default.")
                 return str(date_str) if default is None else default
            return dt_obj.isoformat()
        except Exception as e:
            self.logger.debug(f"Date parsing error for '{date_str}': {e}")
            return str(date_str) if default is None else default

    # --- Platform Specific Transformations (Refined for Phase 1) ---
    # OKI TODO: Implement _transform_ga4, _transform_klaviyo properly in Phase 2/3
    def _transform_facebook_ads(self, df, processed_ids_placeholder):
        entities = []
        relationships = []
        # More robust column mapping
        col_map = {
            "campaign_id": ["campaign_id", "campaign id"],
            "campaign_name": ["campaign_name", "campaign name"],
            "ad_set_id": ["ad_set_id", "ad set id"],
            "ad_set_name": ["ad_set_name", "ad set name"],
            "ad_id": ["ad_id", "ad id"],
            "ad_name": ["ad_name", "ad name"],
            "date": ["date_start", "date", "day"],
            "impressions": ["impressions"],
            "clicks": ["clicks", "link_clicks"],
            "spend": ["amount_spent", "spend"],
            "conversions": ["conversions", "offsite_conversions", "website_conversions", "actions"] # Handle 'actions' which might be list/dict
        }
        active_cols = {key: next((col for col in potentials if col in df.columns), None) for key, potentials in col_map.items()}

        # Check essential identifiers
        if not (active_cols["campaign_id"] or active_cols["campaign_name"]) or \
           not (active_cols["ad_id"] or active_cols["ad_name"]) or \
           not active_cols["date"]:
            logger.error(f"Cannot process Facebook Ads: Missing essential identifiers (Campaign/Ad ID/Name, Date). Available columns: {df.columns.tolist()}")
            return [], []

        for _, row in df.iterrows():
            try:
                date_iso = self._parse_date(self._safe_get(row, active_cols["date"]), default="unknown_date")
                campaign_orig_id = str(self._safe_get(row, active_cols.get("campaign_id"), ''))
                campaign_name = self._safe_get(row, active_cols.get("campaign_name"))
                if not campaign_orig_id and not campaign_name: continue # Skip if no campaign identifier

                campaign_hints = {"platform": "facebook", "type": "campaign"}
                if campaign_orig_id: campaign_hints["original_id"] = campaign_orig_id
                if campaign_name: campaign_hints["name"] = campaign_name
                campaign_entity = { "type": "campaign", "platform": "facebook", "original_id": campaign_orig_id or None, "name": campaign_name, "_resolution_hints": campaign_hints }
                self._add_entity(entities, campaign_entity, processed_ids_placeholder)

                ad_set_hints = None
                ad_set_orig_id = str(self._safe_get(row, active_cols.get("ad_set_id"), ''))
                ad_set_name = self._safe_get(row, active_cols.get("ad_set_name"))
                if ad_set_orig_id or ad_set_name:
                    ad_set_hints = {"platform": "facebook", "type": "ad_set"}
                    if ad_set_orig_id: ad_set_hints["original_id"] = ad_set_orig_id
                    if ad_set_name: ad_set_hints["name"] = ad_set_name
                    ad_set_entity = { "type": "ad_set", "platform": "facebook", "original_id": ad_set_orig_id or None, "name": ad_set_name, "_resolution_hints": ad_set_hints }
                    self._add_entity(entities, ad_set_entity, processed_ids_placeholder)

                ad_orig_id = str(self._safe_get(row, active_cols.get("ad_id"), ''))
                ad_name = self._safe_get(row, active_cols.get("ad_name"))
                if not ad_orig_id and not ad_name: continue # Skip if no ad identifier

                ad_hints = {"platform": "facebook", "type": "ad"}
                if ad_orig_id: ad_hints["original_id"] = ad_orig_id
                if ad_name: ad_hints["name"] = ad_name
                ad_entity = { "type": "ad", "platform": "facebook", "original_id": ad_orig_id or None, "name": ad_name, "_resolution_hints": ad_hints }
                self._add_entity(entities, ad_entity, processed_ids_placeholder)

                # Handle potentially complex 'conversions'/'actions' field
                conversions_val = self._safe_get(row, active_cols.get("conversions"), 0)
                conversions_num = 0.0
                if isinstance(conversions_val, (int, float)):
                     conversions_num = float(conversions_val)
                elif isinstance(conversions_val, list):
                     # Try to find a relevant conversion action (e.g., 'purchase')
                     for action in conversions_val:
                          if isinstance(action, dict) and action.get('action_type') == 'offsite_conversion.fb_pixel_purchase':
                               conversions_num = float(action.get('value', 0))
                               break
                     if conversions_num == 0.0: # Fallback: sum all 'value' fields if purchase not found
                          conversions_num = sum(float(action.get('value', 0)) for action in conversions_val if isinstance(action, dict))
                elif isinstance(conversions_val, str): # Try parsing if it's a JSON string
                     try:
                          parsed_list = json.loads(conversions_val)
                          if isinstance(parsed_list, list):
                               # Apply same logic as above
                               for action in parsed_list:
                                    if isinstance(action, dict) and action.get('action_type') == 'offsite_conversion.fb_pixel_purchase':
                                         conversions_num = float(action.get('value', 0))
                                         break
                               if conversions_num == 0.0:
                                    conversions_num = sum(float(action.get('value', 0)) for action in parsed_list if isinstance(action, dict))
                     except json.JSONDecodeError:
                          self.logger.warning(f"Could not parse 'conversions' string: {conversions_val}")

                perf_entity = {
                    "type": "ad_performance", "platform": "facebook", "date": date_iso,
                    "impressions": float(self._safe_get(row, active_cols.get("impressions"), 0)),
                    "clicks": float(self._safe_get(row, active_cols.get("clicks"), 0)),
                    "spend": float(self._safe_get(row, active_cols.get("spend"), 0)),
                    "conversions": conversions_num,
                    # Add resolution hints for the performance node itself
                    "_resolution_hints": {"type": "ad_performance", "platform": "facebook", "date": date_iso, "ad_original_id": ad_orig_id or None, "ad_name": ad_name},
                    # Add linking hints for relationship creation in KG layer
                    "_link_to_entity_hints": ad_hints,
                    "_link_relationship": "has_performance"
                }
                self._add_entity(entities, perf_entity, processed_ids_placeholder)

                # Add structural relationships
                if ad_set_hints:
                    self._add_relationship(relationships, {"source_hints": campaign_hints, "target_hints": ad_set_hints, "type": "contains"}, processed_ids_placeholder)
                    self._add_relationship(relationships, {"source_hints": ad_set_hints, "target_hints": ad_hints, "type": "contains"}, processed_ids_placeholder)
                else: # Direct link from campaign to ad if no ad set
                    self._add_relationship(relationships, {"source_hints": campaign_hints, "target_hints": ad_hints, "type": "contains"}, processed_ids_placeholder)

            except Exception as row_e:
                self.logger.error(f"Error processing Facebook Ads row: {row_e}", exc_info=False) # Avoid excessive logging
        return entities, relationships

    def _transform_google_ads(self, df, processed_ids_placeholder):
        entities = []; relationships = []; self.logger.info("Processing Google Ads data...")
        col_map = {
            "campaign_id": ["campaign_id", "campaignid"],
            "campaign_name": ["campaign", "campaign_name"],
            "ad_group_id": ["ad_group_id", "adgroupid"],
            "ad_group_name": ["ad_group", "ad_group_name"],
            "ad_id": ["ad_id", "adid", "creative_id"], # Creative ID often used
            "ad_name": ["ad", "ad_name"], # Less common, might be missing
            "date": ["date", "day"],
            "cost": ["cost", "cost_micros"],
            "conversions": ["conversions"],
            "conversion_value": ["conversion_value", "conv_value", "all_conv_value"],
            "clicks": ["clicks"],
            "impressions": ["impressions"],
        }
        active_cols = {key: next((col for col in potentials if col in df.columns), None) for key, potentials in col_map.items()}

        # Check essential identifiers
        if not active_cols["campaign_id"] or not active_cols["ad_group_id"] or not active_cols["ad_id"] or not active_cols["date"]:
            logger.error(f"Cannot process Google Ads: Missing essential IDs (Campaign/AdGroup/Ad ID, Date). Available columns: {df.columns.tolist()}")
            return [], []

        for _, row in df.iterrows():
            try:
                date_iso = self._parse_date(self._safe_get(row, active_cols["date"]), default="unknown_date")
                cost = float(self._safe_get(row, active_cols["cost"], 0))
                # Handle cost in micros
                if active_cols["cost"] and "micros" in active_cols["cost"].lower():
                    cost /= 1_000_000.0

                campaign_orig_id = str(self._safe_get(row, active_cols["campaign_id"], ''))
                campaign_hints = {"platform": "google_ads", "type": "campaign", "original_id": campaign_orig_id}
                campaign_entity = { "type": "campaign", "platform": "google_ads", "original_id": campaign_orig_id, "name": self._safe_get(row, active_cols.get("campaign_name")), "_resolution_hints": campaign_hints }
                self._add_entity(entities, campaign_entity, processed_ids_placeholder)

                ad_group_orig_id = str(self._safe_get(row, active_cols["ad_group_id"], ''))
                ad_group_hints = {"platform": "google_ads", "type": "ad_group", "original_id": ad_group_orig_id}
                ad_group_entity = { "type": "ad_group", "platform": "google_ads", "original_id": ad_group_orig_id, "name": self._safe_get(row, active_cols.get("ad_group_name")), "_resolution_hints": ad_group_hints }
                self._add_entity(entities, ad_group_entity, processed_ids_placeholder)

                ad_orig_id = str(self._safe_get(row, active_cols["ad_id"], ''))
                ad_hints = {"platform": "google_ads", "type": "ad", "original_id": ad_orig_id}
                ad_entity = { "type": "ad", "platform": "google_ads", "original_id": ad_orig_id, "name": self._safe_get(row, active_cols.get("ad_name")), "_resolution_hints": ad_hints }
                self._add_entity(entities, ad_entity, processed_ids_placeholder)

                perf_entity = {
                    "type": "ad_performance", "platform": "google_ads", "date": date_iso,
                    "impressions": float(self._safe_get(row, active_cols.get("impressions"), 0)),
                    "clicks": float(self._safe_get(row, active_cols.get("clicks"), 0)),
                    "cost": cost,
                    "conversions": float(self._safe_get(row, active_cols.get("conversions"), 0)),
                    "conversion_value": float(self._safe_get(row, active_cols.get("conversion_value"), 0)),
                    # Add resolution hints for the performance node itself
                    "_resolution_hints": {"type": "ad_performance", "platform": "google_ads", "date": date_iso, "ad_original_id": ad_orig_id},
                    # Add linking hints for relationship creation in KG layer
                    "_link_to_entity_hints": ad_hints,
                    "_link_relationship": "has_performance"
                }
                self._add_entity(entities, perf_entity, processed_ids_placeholder)

                self._add_relationship(relationships, {"source_hints": campaign_hints, "target_hints": ad_group_hints, "type": "contains"}, processed_ids_placeholder)
                self._add_relationship(relationships, {"source_hints": ad_group_hints, "target_hints": ad_hints, "type": "contains"}, processed_ids_placeholder)
            except Exception as row_e:
                self.logger.error(f"Error processing Google Ads row: {row_e}", exc_info=False)
        return entities, relationships

    def _transform_ga4(self, df, processed_ids_placeholder):
        # OKI TODO: Implement GA4 transformation (Phase 2/3).
        # Needs careful handling of sessions, events, user_pseudo_id, traffic sources, conversions.
        # Requires mapping event parameters to KG attributes/relationships.
        # Example structure: Create Session nodes, User nodes, Event nodes, link them.
        entities = []; relationships = []; logger.warning("GA4 transformation not fully implemented (Phase 2/3).")
        # Placeholder: Extract user and session IDs if available
        user_id_col = next((c for c in df.columns if 'user_pseudo_id' in c or 'user_id' in c), None)
        session_id_col = next((c for c in df.columns if 'ga_session_id' in c or 'session_id' in c), None)
        event_name_col = next((c for c in df.columns if 'event_name' in c), None)
        event_time_col = next((c for c in df.columns if 'event_timestamp' in c), None)

        if user_id_col and session_id_col and event_name_col and event_time_col:
             p_ids = processed_ids_placeholder # Alias for brevity
             for _, row in df.iterrows():
                  try:
                       user_id = str(self._safe_get(row, user_id_col))
                       session_id = str(self._safe_get(row, session_id_col))
                       event_name = self._safe_get(row, event_name_col)
                       event_time_raw = self._safe_get(row, event_time_col)
                       # GA4 timestamps are often microseconds since epoch
                       try:
                            event_time = datetime.datetime.fromtimestamp(int(event_time_raw) / 1_000_000).isoformat()
                       except (ValueError, TypeError):
                            event_time = self._parse_date(event_time_raw) # Fallback parsing

                       if not user_id or not session_id or not event_name or not event_time: continue

                       user_hints = {"platform": "ga4", "type": "user", "original_id": user_id}
                       self._add_entity(entities, {"type": "user", "platform": "ga4", "original_id": user_id, "_resolution_hints": user_hints}, p_ids)

                       session_hints = {"platform": "ga4", "type": "session", "original_id": f"{user_id}_{session_id}"} # Composite ID
                       self._add_entity(entities, {"type": "session", "platform": "ga4", "original_id": f"{user_id}_{session_id}", "user_id": user_id, "_resolution_hints": session_hints}, p_ids)

                       event_id = f"{user_id}_{session_id}_{event_name}_{event_time}" # More unique ID
                       event_hints = {"platform": "ga4", "type": "event", "original_id": event_id}
                       event_props = {k: self._safe_get(row, k) for k in df.columns if k not in [user_id_col, session_id_col]}
                       # Add specific event properties
                       event_props["name"] = event_name
                       event_props["timestamp"] = event_time
                       self._add_entity(entities, {"type": "event", "platform": "ga4", **event_props, "_resolution_hints": event_hints}, p_ids)

                       self._add_relationship(relationships, {"source_hints": user_hints, "target_hints": session_hints, "type": "had_session"}, p_ids)
                       self._add_relationship(relationships, {"source_hints": session_hints, "target_hints": event_hints, "type": "contains_event"}, p_ids)
                  except Exception as row_e:
                       self.logger.error(f"Error processing GA4 row: {row_e}", exc_info=False)
        else:
             logger.warning("Could not find standard GA4 ID columns (user_pseudo_id, ga_session_id, event_name, event_timestamp).")

        return entities, relationships

    def _transform_shopify(self, df, processed_ids_placeholder):
        entities = []; relationships = []; self.logger.info("Processing Shopify data...")
        col_map = {
            "order_id": ["id", "order_id", "name"], # 'name' often includes # prefix
            "customer_id": ["customer_id", "customer.id"], # Handle nested structure if JSON
            "email": ["email", "customer.email", "contact_email"],
            "created_at": ["created_at", "processed_at"],
            "total_price": ["total_price", "total_price_usd"],
            "line_items": ["line_items"],
            "customer_first_name": ["customer.first_name", "billing_address.first_name"],
            "customer_last_name": ["customer.last_name", "billing_address.last_name"],
            "customer_phone": ["phone", "customer.phone", "billing_address.phone"],
            "tags": ["tags", "customer.tags"],
            "source_name": ["source_name"], # e.g., 'web', 'pos'
            "landing_site": ["landing_site"],
            "referring_site": ["referring_site"],
            "utm_source": ["utm_source"], # Check if these exist directly or in referring_site/landing_site
            "utm_medium": ["utm_medium"],
            "utm_campaign": ["utm_campaign"],
        }
        # Helper to get potentially nested values
        def get_nested(row, key_path):
            val = row
            try:
                for key in key_path.split('.'):
                    if isinstance(val, dict): val = val.get(key)
                    elif isinstance(val, pd.Series): val = val.get(key) # Handle Series access
                    else: return None
                return val
            except Exception: return None

        # Find active columns, handling potential nesting
        active_cols = {}
        for key, potentials in col_map.items():
            found_col = None
            for potential in potentials:
                 if '.' in potential: # Nested field check
                      # Check if top-level key exists
                      top_key = potential.split('.')[0]
                      if top_key in df.columns:
                           # Check if a sample row has the nested structure (heuristic)
                           try:
                                # Check non-null sample first
                                sample_val = df[top_key].dropna().iloc[0] if not df[top_key].dropna().empty else None
                                if sample_val is not None and isinstance(sample_val, dict) and get_nested(sample_val, '.'.join(potential.split('.')[1:])) is not None:
                                     found_col = potential # Use the nested path
                                     break
                           except Exception as nested_check_e:
                                self.logger.debug(f"Nested check failed for {potential}: {nested_check_e}")
                                pass # Ignore errors during check
                 elif potential in df.columns: # Direct column check
                      found_col = potential
                      break
            active_cols[key] = found_col


        if not active_cols["order_id"] or not active_cols["created_at"] or not active_cols["total_price"]:
            logger.error(f"Cannot process Shopify Orders: Missing essential fields (Order ID, Created At, Total Price). Available columns: {df.columns.tolist()}")
            return [], []

        for _, row in df.iterrows():
            try:
                order_orig_id_raw = get_nested(row, active_cols["order_id"]) if active_cols.get("order_id") and '.' in active_cols["order_id"] else self._safe_get(row, active_cols.get("order_id"))
                if order_orig_id_raw is None: continue # Skip row if order ID is missing
                order_orig_id = str(order_orig_id_raw).lstrip('#') # Clean potential '#' prefix from 'name' field
                created_at_iso = self._parse_date(get_nested(row, active_cols["created_at"]) if active_cols.get("created_at") and '.' in active_cols["created_at"] else self._safe_get(row, active_cols.get("created_at")), default="unknown_date")

                order_hints = {"platform": "shopify", "type": "order", "original_id": order_orig_id}
                order_entity = {
                    "type": "order", "platform": "shopify", "original_id": order_orig_id,
                    "created_at": created_at_iso,
                    "total_price": float(get_nested(row, active_cols["total_price"]) if active_cols.get("total_price") and '.' in active_cols["total_price"] else self._safe_get(row, active_cols.get("total_price"), 0)),
                    "customer_email": get_nested(row, active_cols.get("email")) if active_cols.get("email") and '.' in active_cols.get("email") else self._safe_get(row, active_cols.get("email")),
                    "source_name": self._safe_get(row, active_cols.get("source_name")),
                    "landing_site": self._safe_get(row, active_cols.get("landing_site")),
                    "referring_site": self._safe_get(row, active_cols.get("referring_site")),
                    "utm_source": self._safe_get(row, active_cols.get("utm_source")),
                    "utm_medium": self._safe_get(row, active_cols.get("utm_medium")),
                    "utm_campaign": self._safe_get(row, active_cols.get("utm_campaign")),
                    "_resolution_hints": order_hints
                }
                self._add_entity(entities, order_entity, processed_ids_placeholder)

                customer_hints = None
                customer_orig_id = str(get_nested(row, active_cols.get("customer_id")) if active_cols.get("customer_id") and '.' in active_cols.get("customer_id") else self._safe_get(row, active_cols.get("customer_id"), ''))
                customer_email = order_entity["customer_email"] # Use email from order entity
                customer_phone = get_nested(row, active_cols.get("customer_phone")) if active_cols.get("customer_phone") and '.' in active_cols.get("customer_phone") else self._safe_get(row, active_cols.get("customer_phone"))
                customer_first_name = get_nested(row, active_cols.get("customer_first_name")) if active_cols.get("customer_first_name") and '.' in active_cols.get("customer_first_name") else self._safe_get(row, active_cols.get("customer_first_name"))
                customer_last_name = get_nested(row, active_cols.get("customer_last_name")) if active_cols.get("customer_last_name") and '.' in active_cols.get("customer_last_name") else self._safe_get(row, active_cols.get("customer_last_name"))
                customer_tags = get_nested(row, active_cols.get("tags")) if active_cols.get("tags") and '.' in active_cols.get("tags") else self._safe_get(row, active_cols.get("tags"))

                # Use email as primary identifier if customer_id is missing but email exists
                if customer_email or customer_orig_id or customer_phone:
                    customer_hints = {"type": "customer"}
                    if customer_orig_id: customer_hints["platform_shopify_id"] = customer_orig_id
                    if customer_email: customer_hints["email"] = customer_email
                    if customer_phone: customer_hints["phone"] = customer_phone
                    # Add names only if trying to resolve based on them is intended
                    # if customer_first_name: customer_hints["first_name"] = customer_first_name
                    # if customer_last_name: customer_hints["last_name"] = customer_last_name

                    customer_entity = {
                        "type": "customer",
                        "original_id": customer_orig_id or None,
                        "email": customer_email, "phone": customer_phone,
                        "first_name": customer_first_name, "last_name": customer_last_name,
                        "tags": customer_tags,
                        "_resolution_hints": customer_hints
                    }
                    self._add_entity(entities, customer_entity, processed_ids_placeholder)

                product_hints_in_order = []
                line_items_data = get_nested(row, active_cols.get("line_items")) if active_cols.get("line_items") and '.' in active_cols.get("line_items") else self._safe_get(row, active_cols.get("line_items"))
                if line_items_data:
                     try:
                          # Handle data that might be JSON string or already list/dict
                          line_items = json.loads(line_items_data) if isinstance(line_items_data, str) else line_items_data
                          if isinstance(line_items, list):
                               for item in line_items:
                                    if not isinstance(item, dict): continue
                                    prod_orig_id = str(item.get("product_id", item.get("id", '')))
                                    prod_sku = item.get("sku")
                                    if prod_orig_id or prod_sku:
                                         prod_hints = {"platform": "shopify", "type": "product"}
                                         if prod_orig_id: prod_hints["original_id"] = prod_orig_id
                                         if prod_sku: prod_hints["sku"] = prod_sku
                                         product_hints_in_order.append(prod_hints)
                                         product_entity = {
                                             "type": "product", "platform": "shopify",
                                             "original_id": prod_orig_id or None,
                                             "name": item.get("name", item.get("title")),
                                             "sku": prod_sku,
                                             "price": float(item.get("price", 0)),
                                             "vendor": item.get("vendor"),
                                             "quantity": int(item.get("quantity", 1)),
                                             "_resolution_hints": prod_hints
                                         }
                                         self._add_entity(entities, product_entity, processed_ids_placeholder)
                     except Exception as li_e:
                          self.logger.warning(f"Could not parse line_items for order {order_orig_id}: {li_e}")

                # Add relationships
                if customer_hints:
                    self._add_relationship(relationships, {"source_hints": customer_hints, "target_hints": order_hints, "type": "placed_order", "date": created_at_iso}, processed_ids_placeholder)
                for prod_hints in product_hints_in_order:
                    self._add_relationship(relationships, {"source_hints": order_hints, "target_hints": prod_hints, "type": "contains_product"}, processed_ids_placeholder)

            except Exception as row_e:
                self.logger.error(f"Error processing Shopify row: {row_e}", exc_info=False)
        return entities, relationships

    def _transform_klaviyo(self, df, processed_ids_placeholder):
        # OKI TODO: Implement Klaviyo transformation (Phase 2/3).
        # Needs handling of email/SMS events (Open, Click, Received), campaign IDs, profile data.
        # Link events to Customer nodes (via email) and Campaign nodes.
        entities = []; relationships = []; logger.warning("Klaviyo transformation not fully implemented (Phase 2/3).")
        # Example structure: Identify event type, timestamp, profile email, campaign/flow ID
        email_col = next((c for c in df.columns if 'email' in c.lower() or 'profile.email' in c.lower()), None)
        event_type_col = next((c for c in df.columns if 'type' == c.lower() or 'metric.name' in c.lower()), None) # Klaviyo often uses 'Metric Name'
        timestamp_col = next((c for c in df.columns if 'timestamp' in c.lower()), None)
        campaign_id_col = next((c for c in df.columns if 'campaign.id' in c.lower() or 'campaign_id' in c.lower()), None)
        campaign_name_col = next((c for c in df.columns if 'campaign.name' in c.lower() or 'campaign_name' in c.lower()), None)

        if email_col and event_type_col and timestamp_col:
             p_ids = processed_ids_placeholder # Alias
             for _, row in df.iterrows():
                  try:
                       email = self._safe_get(row, email_col)
                       event_type = self._safe_get(row, event_type_col)
                       timestamp_raw = self._safe_get(row, timestamp_col)
                       # Klaviyo timestamps are often Unix epoch seconds
                       try:
                            timestamp = datetime.datetime.fromtimestamp(int(timestamp_raw)).isoformat()
                       except (ValueError, TypeError):
                            timestamp = self._parse_date(timestamp_raw) # Fallback parsing

                       if not email or not event_type or not timestamp: continue

                       customer_hints = {"type": "customer", "email": email}
                       self._add_entity(entities, {"type": "customer", "email": email, "_resolution_hints": customer_hints}, p_ids)

                       event_props = {k: self._safe_get(row, k) for k in df.columns} # Include all properties
                       event_id = f"klaviyo_{email}_{event_type}_{timestamp}" # Simple unique ID
                       event_hints = {"platform": "klaviyo", "type": "engagement_event", "original_id": event_id}
                       # Add specific event properties
                       event_props["event_type"] = event_type
                       event_props["timestamp"] = timestamp
                       self._add_entity(entities, {"type": "engagement_event", "platform": "klaviyo", **event_props, "_resolution_hints": event_hints}, p_ids)

                       self._add_relationship(relationships, {"source_hints": customer_hints, "target_hints": event_hints, "type": "had_engagement", "event_type": event_type, "date": timestamp}, p_ids)

                       # Link to campaign if available
                       campaign_id = self._safe_get(row, campaign_id_col)
                       campaign_name = self._safe_get(row, campaign_name_col)
                       if campaign_id or campaign_name:
                            campaign_hints = {"platform": "klaviyo", "type": "campaign"}
                            if campaign_id: campaign_hints["original_id"] = str(campaign_id)
                            if campaign_name: campaign_hints["name"] = campaign_name
                            self._add_entity(entities, {"type": "campaign", "platform": "klaviyo", "original_id": str(campaign_id) if campaign_id else None, "name": campaign_name, "_resolution_hints": campaign_hints}, p_ids)
                            self._add_relationship(relationships, {"source_hints": event_hints, "target_hints": campaign_hints, "type": "part_of_campaign"}, p_ids)
                  except Exception as row_e:
                       self.logger.error(f"Error processing Klaviyo row: {row_e}", exc_info=False)
        else:
             logger.warning("Could not find standard Klaviyo columns (email, event type, timestamp).")

        return entities, relationships

    def _transform_generic(self, df, data_type, processed_ids_placeholder):
        entities = []; relationships = []; self.logger.warning(f"Using generic transformation for data type: {data_type}. Results may be limited.")
        # Try to find a reasonably unique ID column
        id_col = next((col for col in df.columns if 'id' in col.lower() and df[col].nunique() > df.shape[0] * 0.8), None) # Heuristic: mostly unique
        if not id_col and df.shape[0] > 0: # If no good ID, try index as fallback if small
             if df.shape[0] < 10000:
                  df = df.reset_index()
                  id_col = 'index'
             else: # If large and no ID, cannot reliably create entities
                  self.logger.error(f"No suitable 'id' column found for generic transformation of large data type {data_type}. Cannot create entities.")
                  return [], []
        elif not id_col: # Empty dataframe
             return [], []


        for _, row in df.iterrows():
             try:
                  orig_id = str(self._safe_get(row, id_col))
                  entity_hints = {"data_type": data_type, "original_id": orig_id}
                  entity_props = {"type": "generic_entity", "data_type": data_type, "original_id": orig_id}
                  for col in df.columns:
                       if col != id_col:
                            entity_props[col] = self._safe_get(row, col) # Add all other columns as properties
                  entity_props["_resolution_hints"] = entity_hints
                  self._add_entity(entities, entity_props, processed_ids_placeholder)
             except Exception as row_e:
                  self.logger.error(f"Error processing generic row: {row_e}", exc_info=False)

        return entities, relationships

    def process_all_data_for_kg(self):
        """Process all available data from GCS for Knowledge Graph"""
        if not self.data_sources.get("gcs", False):
            self.logger.warning("GCS not available. Cannot process data for KG.")
            return {}
        available_files = self.list_available_data()
        if not available_files:
            self.logger.info("No files found in GCS bucket to process.")
            return {}

        # Group files by inferred type
        file_groups = defaultdict(list)
        for f in available_files:
            f_lower = f.lower()
            # OKI TODO: Add more robust type detection, maybe based on path patterns
            if 'facebook' in f_lower or 'meta' in f_lower: group = 'facebook_ads'
            elif 'googleads' in f_lower: group = 'google_ads'
            elif 'ga4' in f_lower: group = 'ga4'
            elif 'shopify' in f_lower: group = 'shopify'
            elif 'klaviyo' in f_lower: group = 'klaviyo'
            elif 'support' in f_lower or 'transcript' in f_lower: group = 'support_transcript'
            elif 'returns' in f_lower: group = 'returns_data'
            elif f_lower.endswith(('.txt', '.log')): group = 'text_log'
            elif f_lower.endswith(('.jpg', '.jpeg', '.png')): group = 'image'
            elif f_lower.endswith(('.mp4', '.avi', '.mov')): group = 'video'
            else: group = 'generic'
            file_groups[group].append(f)

        results = {"entities": [], "relationships": [], "stats": {}}
        self.logger.info(f"Processing files grouped by type: {list(file_groups.keys())}")

        for data_type, files in file_groups.items():
            self.logger.info(f"Processing {len(files)} files for type: {data_type}")
            start_time = time.time()
            try:
                kg_struct = self.extract_and_transform_for_kg(files, data_type=data_type)
                duration = time.time() - start_time
                results["entities"].extend(kg_struct.get("entities", []))
                results["relationships"].extend(kg_struct.get("relationships", []))
                results["stats"][data_type] = {
                    "files_processed": len(files),
                    "entities_extracted": len(kg_struct.get("entities", [])),
                    "relationships_extracted": len(kg_struct.get("relationships", [])),
                    "duration_seconds": duration,
                    "status": "success"
                }
            except Exception as e:
                 duration = time.time() - start_time
                 self.logger.error(f"Failed processing data type {data_type}: {e}", exc_info=True)
                 results["stats"][data_type] = {
                     "files_processed": len(files), "entities_extracted": 0, "relationships_extracted": 0,
                     "duration_seconds": duration, "status": "error", "error": str(e)
                 }

        total_entities = len(results["entities"])
        total_relationships = len(results["relationships"])
        self.logger.info(f"Finished processing all data. Total Entities: {total_entities}, Total Relationships: {total_relationships}")
        results["stats"]["totals"] = {"entities": total_entities, "relationships": total_relationships}
        return results

    def get_connection_status(self):
        """Get status of all data source connections"""
        return {
            source: {
                "connected": self.data_sources.get(source, False),
                "last_update": self.last_update.get(source).isoformat() if self.last_update.get(source) else None
            } for source in ["gcs", "bigquery"]
        }

    # OKI TODO (Phase 2/3): Implement streaming data ingestion
    def process_stream_event(self, event_data, source_type):
         """Placeholder for processing a single event from a stream."""
         self.logger.info(f"Placeholder: Processing stream event from {source_type}")
         # 1. Parse event_data
         # 2. Transform into entity/relationship format with hints (potentially using LLaMA 4 for complex events)
         # 3. Queue the update for the KnowledgeGraphAgent (via Orchestrator)
         pass

# --- Initialization and Example Usage ---
# Assuming config (EnhancedConfig instance from Cell 1) and foundation_model_client (from Cell 18) exist
data_pipeline = None
if 'config' in locals() and config and PROJECT_ID and BUCKET_NAME:
    # Pass fm_client if available
    _fm_client = foundation_model_client if 'foundation_model_client' in locals() else None
    try:
        data_pipeline = DataIngestionPipeline(config, PROJECT_ID, BUCKET_NAME, foundation_model_client=_fm_client)
        print("--- MIZ 3.0 OKI Data Ingestion Pipeline Initialized (Enhanced) ---")
        print(f"Connection Status: {data_pipeline.get_connection_status()}")
        if not _fm_client:
            print("WARNING: FoundationModelClient not provided. Semantic/Multimodal processing disabled.")
        print("-----------------------------------------------------------------")

        # Example: Process data (assuming files exist in GCS)
        # print("\nAttempting to process data from GCS for KG...")
        # sample_kg_data = data_pipeline.process_all_data_for_kg()
        # print("\nSample KG Data Structure (First 5 Entities/Rels):")
        # print("Entities:", json.dumps(sample_kg_data.get("entities", [])[:5], indent=2, default=str))
        # print("\nRelationships:", json.dumps(sample_kg_data.get("relationships", [])[:5], indent=2, default=str))
        # print("\nStats:", json.dumps(sample_kg_data.get("stats", {}), indent=2))

    except Exception as e:
        print(f"Error initializing or running DataIngestionPipeline: {e}")
        logger.error("DataIngestionPipeline failed.", exc_info=True)
else:
    print("Error: EnhancedConfig, PROJECT_ID, or BUCKET_NAME not defined. Cannot initialize DataIngestionPipeline.")

# OKI TODO (Phase 2/3): Implement full multimodal data handling (image/video analysis using LLaMA 4).
# OKI TODO (Phase 1/2): Refactor this class into a MoA DataProcessingAgent.





Error: EnhancedConfig, PROJECT_ID, or BUCKET_NAME not defined. Cannot initialize DataIngestionPipeline.


In [13]:
# Cell 2.1: Knowledge Graph Loading Step (Enhanced for OKI - Refined & Improved)
# Status: Uses placeholder ProductionDLQ. Added more detailed error checking for bulk results. MoA refactor noted. Dependency on Cell 3 adapter implementation highlighted. DLQ logging enhanced.
# FUTURE WORK: Refactor into MoA KnowledgeGraphAgent. Implement production DLQ backend.

import logging
import datetime
import json
import time
import os

# Assume 'eshkg' (EnhancedSelfHealingKG instance from Cell 3) exists.
# Assume 'logger' is configured (using the global logger from Cell 1).
# Assume EnhancedSelfHealingKG class is defined (from Cell 3)
# Assume EnhancedConfig object 'config' exists from Cell 1

logger_kg_loading = logging.getLogger('MIZ-OKI.KGLoading')

# --- Placeholder Production Dead Letter Queue (DLQ) ---
# OKI TODO: Replace with actual implementation (e.g., Pub/Sub, Database table) based on config.
class ProductionDLQ:
    """Placeholder for a production-ready Dead Letter Queue."""
    def __init__(self, config: EnhancedConfig):
        self.config = config
        # Get DLQ target from config (e.g., 'log_only', 'pubsub:topic_name', 'db:table_name')
        self.dlq_target = config.get("dlq_target", "log_only")
        self.logger = logging.getLogger('MIZ-OKI.ProductionDLQ')
        self.pubsub_publisher = None # Placeholder for Pub/Sub client
        self.db_connection = None # Placeholder for DB connection

        if self.dlq_target != "log_only":
             self.logger.info(f"ProductionDLQ initialized (Target: {self.dlq_target})")
             self._initialize_backend()
        else:
             self.logger.info("ProductionDLQ initialized (Target: log_only)")


    def _initialize_backend(self):
        """Initialize connection to the configured DLQ backend."""
        # OKI TODO: Implement backend initialization based on self.dlq_target
        if self.dlq_target.startswith("pubsub:"):
             # topic_name = self.dlq_target.split(":", 1)[1]
             # Initialize Pub/Sub publisher client
             self.logger.info(f"Placeholder: Initializing Pub/Sub publisher for DLQ topic '{self.dlq_target}'")
             pass
        elif self.dlq_target.startswith("db:"):
             # table_name = self.dlq_target.split(":", 1)[1]
             # Initialize database connection
             self.logger.info(f"Placeholder: Initializing DB connection for DLQ table '{self.dlq_target}'")
             pass
        else:
             self.logger.warning(f"Unknown DLQ target format: '{self.dlq_target}'. Falling back to log_only.")
             self.dlq_target = "log_only"

    def write(self, item_type, item_data, error_message):
        """Writes a failed item to the configured DLQ target."""
        # IMPROVEMENT: Include resolution hints in DLQ data if available
        hints = item_data.get('_resolution_hints') or item_data.get('source_hints') or item_data.get('target_hints')
        dlq_entry = {
            "timestamp": datetime.datetime.now().isoformat(),
            "item_type": item_type, # "entity" or "relationship"
            "data": item_data, # Log the full original data
            "hints": hints, # Log hints separately for easier debugging
            "error": error_message,
            "source_system": "MIZ3_KG_Loading"
        }
        try:
            if self.dlq_target == "log_only":
                # Log as error for visibility
                # Limit data size in log message
                log_data_preview = str(item_data)[:500] + ('...' if len(str(item_data)) > 500 else '')
                self.logger.error(f"DLQ Entry ({item_type}): Error='{error_message}', Hints='{hints}', Data='{log_data_preview}'")
            elif self.dlq_target.startswith("pubsub:"):
                # OKI TODO: Implement Pub/Sub publishing
                # topic_path = self.pubsub_publisher.topic_path(...)
                # data = json.dumps(dlq_entry, default=str).encode("utf-8")
                # future = self.pubsub_publisher.publish(topic_path, data)
                # future.result() # Wait for publish confirmation (optional)
                self.logger.debug(f"Placeholder: Published DLQ entry to Pub/Sub '{self.dlq_target}'")
                pass
            elif self.dlq_target.startswith("db:"):
                # OKI TODO: Implement database insertion
                # cursor = self.db_connection.cursor()
                # cursor.execute("INSERT INTO dlq_table (...) VALUES (...)", (..., json.dumps(item_data), ...))
                # self.db_connection.commit()
                self.logger.debug(f"Placeholder: Inserted DLQ entry into DB '{self.dlq_target}'")
                pass
            else:
                 # Should not happen if _initialize_backend handles fallback
                 self.logger.error(f"DLQ Entry ({item_type}) - Unknown Target '{self.dlq_target}': {json.dumps(dlq_entry, default=str)}")

        except Exception as e:
            # Fallback to logging if primary DLQ target fails
            self.logger.error(f"Failed to write to DLQ target {self.dlq_target}: {e}. Falling back to log.")
            log_data_preview = str(item_data)[:500] + ('...' if len(str(item_data)) > 500 else '')
            self.logger.error(f"DLQ Fallback Entry ({item_type}): Error='{error_message}', Hints='{hints}', Data='{log_data_preview}'")

    def close(self):
         """Close connections if applicable."""
         # OKI TODO: Implement closing logic for Pub/Sub, DB
         self.logger.info(f"Closing DLQ handler (Target: {self.dlq_target}).")
         pass

# Initialize DLQ (assuming config instance exists from Cell 1)
dlq_handler = None
if 'config' in locals() and config:
     try:
          dlq_handler = ProductionDLQ(config)
     except Exception as dlq_init_e:
          logger.error(f"Failed to initialize ProductionDLQ: {dlq_init_e}. Falling back to basic logging.")
          # Fallback to basic logging function if DLQ class fails
          def log_dlq_fallback(item_type, item_data, error_message):
               hints = item_data.get('_resolution_hints') or item_data.get('source_hints') or item_data.get('target_hints')
               log_data_preview = str(item_data)[:200] + ('...' if len(str(item_data)) > 200 else '')
               logger.error(f"DLQ Fallback ({item_type}): Error='{error_message}', Hints='{hints}', Data='{log_data_preview}'")
          dlq_handler = type('obj', (object,), {'write': log_dlq_fallback, 'close': lambda: None})() # Dummy object
else:
     logger.error("Cannot initialize ProductionDLQ: Config not found.")
     # Fallback to basic logging function
     def log_dlq_fallback(item_type, item_data, error_message):
          hints = item_data.get('_resolution_hints') or item_data.get('source_hints') or item_data.get('target_hints')
          log_data_preview = str(item_data)[:200] + ('...' if len(str(item_data)) > 200 else '')
          logger.error(f"DLQ Fallback ({item_type}): Error='{error_message}', Hints='{hints}', Data='{log_data_preview}'")
     dlq_handler = type('obj', (object,), {'write': log_dlq_fallback, 'close': lambda: None})()


# --- Main Loading Function ---
# OKI TODO: Refactor this function into a MoA KnowledgeGraphAgent (Layer 3)
def load_data_into_kg(eshkg_instance, kg_data):
    """
    Loads entities and relationships into the EnhancedSelfHealingKG.
    OKI Enhancement: Uses ProductionDLQ placeholder, improved error checking on bulk results.
    Dependency: Relies on robust bulk/transactional methods implemented in the KG adapter (Cell 3).
    Future Work: Refactor into MoA KnowledgeGraphAgent.
    """
    # --- Input Validation ---
    # IMPROVEMENT: Check for EnhancedSelfHealingKG class existence more robustly
    kg_class_exists = 'EnhancedSelfHealingKG' in globals()
    if not kg_class_exists or not isinstance(eshkg_instance, EnhancedSelfHealingKG):
         logger_kg_loading.error("Invalid or undefined EnhancedSelfHealingKG instance provided. KG loading aborted.")
         return {"success": False, "error": "Invalid KG instance"}
    if not isinstance(kg_data, dict) or ('entities' not in kg_data and 'relationships' not in kg_data):
         logger_kg_loading.error("Invalid kg_data format. Must be dict with 'entities' and/or 'relationships'.")
         return {"success": False, "error": "Invalid data format"}
    if not kg_data.get('entities') and not kg_data.get('relationships'):
         logger_kg_loading.info("No entities or relationships found in kg_data to load.")
         return {"success": True, "message": "No data to load", "stats": {}}

    logger_kg_loading.info(f"Starting Knowledge Graph loading...")
    start_time = time.time()

    # --- Statistics Tracking ---
    stats = {
        "entities_processed": 0, "entities_loaded_new": 0, "entities_updated": 0, "entities_failed": 0,
        "relationships_processed": 0, "relationships_loaded": 0, "relationships_failed": 0,
        "anomalies_fixed_post_load": 0
    }

    # Use transaction context manager from KG adapter
    try:
        # OKI Dependency: Assumes eshkg_instance.transaction() is robustly implemented in Cell 3 adapter
        # IMPROVEMENT: Add comment reinforcing dependency
        logger_kg_loading.info("Attempting to start KG transaction (relies on Cell 3 adapter implementation)...")
        with eshkg_instance.transaction() as tx:
            logger_kg_loading.info("KG transaction started.")

            # --- 1. Load Entities ---
            entities_to_load = kg_data.get('entities', [])
            stats["entities_processed"] = len(entities_to_load)
            logger_kg_loading.info(f"Processing {stats['entities_processed']} entities for loading...")

            if entities_to_load:
                try:
                    # OKI Dependency: Assumes add_entities_bulk in Cell 3 adapter returns dict:
                    # {"new": count, "updated": count, "failed": count, "failures": [{"data": dict, "error": str}]}
                    # IMPROVEMENT: Add comment reinforcing dependency
                    logger_kg_loading.info("Calling add_entities_bulk (relies on Cell 3 adapter implementation)...")
                    bulk_entity_results = eshkg_instance.add_entities_bulk(entities_to_load, source="batch_ingest", transaction=tx)

                    # Validate results structure
                    if not isinstance(bulk_entity_results, dict):
                         raise TypeError(f"add_entities_bulk returned unexpected type: {type(bulk_entity_results)}")

                    stats["entities_loaded_new"] = bulk_entity_results.get("new", 0)
                    stats["entities_updated"] = bulk_entity_results.get("updated", 0)
                    stats["entities_failed"] = bulk_entity_results.get("failed", 0)

                    # Write failed items to DLQ
                    failures = bulk_entity_results.get("failures", [])
                    if not isinstance(failures, list):
                         logger_kg_loading.warning(f"Bulk entity failures format is not a list: {type(failures)}. Cannot process DLQ entries.")
                    else:
                         for failure in failures:
                              if isinstance(failure, dict):
                                   # Ensure data is serializable for DLQ
                                   try: json.dumps(failure.get("data", {}), default=str)
                                   except TypeError: data_to_log = {"error": "Non-serializable data"}
                                   else: data_to_log = failure.get("data", {})
                                   # IMPROVEMENT: Pass original data (data_to_log) to DLQ write
                                   dlq_handler.write("entity", data_to_log, failure.get("error", "Unknown bulk load error"))
                              else:
                                   logger_kg_loading.warning(f"Skipping invalid failure entry in bulk entity results: {failure}")

                except NotImplementedError:
                    logger_kg_loading.warning("Bulk entity loading not implemented by adapter. Falling back to individual loading (less efficient).")
                    # Fallback logic remains same...
                    for entity_dict in entities_to_load:
                        try:
                            result = eshkg_instance.add_entity(entity_dict, source="batch_ingest", transaction=tx)
                            if result and result.get("success"):
                                if result.get("is_new"): stats["entities_loaded_new"] += 1
                                else: stats["entities_updated"] += 1
                            else:
                                stats["entities_failed"] += 1
                                error_msg = result.get("error", "add_entity returned failure") if isinstance(result, dict) else "add_entity failed"
                                dlq_handler.write("entity", entity_dict, error_msg)
                        except Exception as e:
                            stats["entities_failed"] += 1
                            dlq_handler.write("entity", entity_dict, str(e))
                            logger_kg_loading.error(f"Failed to load entity (individual): {e}", exc_info=False)
                except Exception as bulk_e:
                     logger_kg_loading.error(f"Bulk entity loading failed critically: {bulk_e}", exc_info=True)
                     stats["entities_failed"] = len(entities_to_load) # Assume all failed if bulk op fails
                     # Attempt to DLQ all items
                     for entity_dict in entities_to_load:
                          dlq_handler.write("entity", entity_dict, f"Bulk operation failed: {bulk_e}")


            logger_kg_loading.info(f"Entity loading phase complete. New: {stats['entities_loaded_new']}, Updated: {stats['entities_updated']}, Failed: {stats['entities_failed']}")

            # --- 2. Load Relationships ---
            relationships_to_load = kg_data.get('relationships', [])
            stats["relationships_processed"] = len(relationships_to_load)
            logger_kg_loading.info(f"Processing {stats['relationships_processed']} relationships for loading...")

            if relationships_to_load:
                try:
                    # OKI Dependency: Assumes add_relationships_bulk in Cell 3 adapter returns dict:
                    # {"loaded": count, "failed": count, "failures": [{"data": dict, "error": str}]}
                    # IMPROVEMENT: Add comment reinforcing dependency
                    logger_kg_loading.info("Calling add_relationships_bulk (relies on Cell 3 adapter implementation)...")
                    bulk_rel_results = eshkg_instance.add_relationships_bulk(relationships_to_load, transaction=tx)

                    if not isinstance(bulk_rel_results, dict):
                         raise TypeError(f"add_relationships_bulk returned unexpected type: {type(bulk_rel_results)}")

                    stats["relationships_loaded"] = bulk_rel_results.get("loaded", 0)
                    stats["relationships_failed"] = bulk_rel_results.get("failed", 0)

                    failures = bulk_rel_results.get("failures", [])
                    if not isinstance(failures, list):
                         logger_kg_loading.warning(f"Bulk relationship failures format is not a list: {type(failures)}. Cannot process DLQ entries.")
                    else:
                         for failure in failures:
                              if isinstance(failure, dict):
                                   # Ensure data is serializable for DLQ
                                   try: json.dumps(failure.get("data", {}), default=str)
                                   except TypeError: data_to_log = {"error": "Non-serializable data"}
                                   else: data_to_log = failure.get("data", {})
                                   # IMPROVEMENT: Pass original data (data_to_log) to DLQ write
                                   dlq_handler.write("relationship", data_to_log, failure.get("error", "Unknown bulk load error"))
                              else:
                                   logger_kg_loading.warning(f"Skipping invalid failure entry in bulk relationship results: {failure}")

                except NotImplementedError:
                    logger_kg_loading.warning("Bulk relationship loading not implemented by adapter. Falling back to individual loading.")
                    # Fallback logic remains same...
                    for rel_dict in relationships_to_load:
                        try:
                            success = eshkg_instance.add_relationship(rel_dict, transaction=tx)
                            if success: stats["relationships_loaded"] += 1
                            else:
                                stats["relationships_failed"] += 1
                                dlq_handler.write("relationship", rel_dict, "add_relationship returned failure")
                        except Exception as e:
                            stats["relationships_failed"] += 1
                            dlq_handler.write("relationship", rel_dict, str(e))
                            logger_kg_loading.error(f"Failed to load relationship (individual): {e}", exc_info=False)
                except Exception as bulk_e:
                     logger_kg_loading.error(f"Bulk relationship loading failed critically: {bulk_e}", exc_info=True)
                     stats["relationships_failed"] = len(relationships_to_load)
                     for rel_dict in relationships_to_load:
                          dlq_handler.write("relationship", rel_dict, f"Bulk operation failed: {bulk_e}")

            logger_kg_loading.info(f"Relationship loading phase complete. Loaded: {stats['relationships_loaded']}, Failed: {stats['relationships_failed']}")

            # Transaction manager in KG adapter decides commit/rollback based on errors encountered.
            logger_kg_loading.info("Committing transaction (via adapter context manager)...") # Conceptual log

        # --- Transaction finished ---
        logger_kg_loading.info("Transaction finished.")

        # --- 3. Post-Load Maintenance (Self-Healing) ---
        logger_kg_loading.info("Running post-load anomaly detection and healing...")
        try:
            # OKI Dependency: Assumes detect_and_heal_anomalies in Cell 3 uses adapter queries
            fixed_count = eshkg_instance.detect_and_heal_anomalies()
            stats["anomalies_fixed_post_load"] = fixed_count
            logger_kg_loading.info(f"Anomaly detection and healing complete. {fixed_count} anomalies potentially fixed.")
        except Exception as e:
            logger_kg_loading.error(f"Error during post-load maintenance: {e}", exc_info=True)
            stats["error_post_load_maintenance"] = str(e)

    except Exception as tx_e:
        # Catch errors initiating or during the transaction context
        logger_kg_loading.error(f"Transaction failed: {tx_e}", exc_info=True)
        stats["error_transaction"] = str(tx_e)
        # Assume partial failures were logged to DLQ within the try block.
        return {"success": False, "error": f"Transaction failed: {tx_e}", "stats": stats}

    # --- Final Logging ---
    end_time = time.time()
    duration = end_time - start_time
    final_kg_stats = eshkg_instance.get_stats() # Assumes get_stats uses adapter
    final_nodes = final_kg_stats.get("nodes", "N/A")
    final_edges = final_kg_stats.get("edges", "N/A")

    logger_kg_loading.info(f"Knowledge Graph loading finished in {duration:.2f} seconds.")
    logger_kg_loading.info(f"Final Graph Stats - Nodes: {final_nodes}, Edges: {final_edges}")
    stats["duration_seconds"] = duration
    stats["final_node_count"] = final_nodes
    stats["final_edge_count"] = final_edges
    # Consider load successful if *some* data loaded, but log failures
    stats["success"] = (stats["entities_loaded_new"] + stats["entities_updated"] + stats["relationships_loaded"]) > 0
    if stats["entities_failed"] > 0 or stats["relationships_failed"] > 0:
         logger_kg_loading.warning(f"Load completed with {stats['entities_failed']} entity failures and {stats['relationships_failed']} relationship failures. Check DLQ.")

    return stats


# --- Example Usage ---
# Assuming eshkg (from Cell 3) and sample_kg_data (from Cell 2) exist
# if 'eshkg' in locals() and eshkg and 'sample_kg_data' in locals() and sample_kg_data:
#     print("\n--- Running KG Loading Step (Enhanced) ---")
#     loading_results = load_data_into_kg(eshkg, sample_kg_data)
#     print("\nKG Loading Results:")
#     print(json.dumps(loading_results, indent=2, default=str))
#     print("------------------------------------------")
# else:
#     print("\nSkipping KG Loading example: 'eshkg' or 'sample_kg_data' not available or empty.")

# --- Cleanup DLQ Handler ---
# Ensure cleanup happens, e.g., at the end of the notebook session or application lifecycle
# if dlq_handler:
#     dlq_handler.close()

# OKI TODO (Phase 1/2): Refactor this function into a MoA KnowledgeGraphAgent.



















ERROR:MIZ-OKI.DataIngestion:Cannot initialize ProductionDLQ: Config not found.


In [26]:
# Cell 3: Knowledge Graph Layer Implementation (MIZ 3.0 OKI - Reworked for Neo4j)
# Status: Neo4jAdapter implemented with core/bulk methods. ESHKG uses adapter. Anomaly Detector uses Cypher. NetworkX removed. Vector index placeholders added.

import os
import numpy as np
import pandas as pd
import networkx as nx # Kept only for potential external graph analysis, NOT core storage
import datetime
import json
import logging
import time
import uuid
import hashlib
from collections import defaultdict, deque
from contextlib import contextmanager
from typing import Dict, Any, Optional, List, Union, Tuple, Set

# --- Neo4j Integration ---
try:
    from neo4j import GraphDatabase, basic_auth, exceptions as neo4j_exceptions
    NEO4J_AVAILABLE = True
except ImportError:
    NEO4J_AVAILABLE = False
    GraphDatabase = None
    basic_auth = None
    neo4j_exceptions = None
    print("CRITICAL: 'neo4j' library not found. Install (`pip install neo4j`) for KG functionality.")

# Assuming dependencies from other cells
# from cell1 import EnhancedConfig, CONFIG # Use EnhancedConfig instance
# from cell6 import NeuralProcessing # For embedding generation if done here

# Use the global logger
logger = logging.getLogger('MIZ-OKI.KnowledgeGraph')

# --- Graph Storage Adapter Interface ---
class GraphStorageAdapter(ABC):
    """Abstract base class for different graph database backends."""
    @abstractmethod
    def connect(self): pass
    @abstractmethod
    def close(self): pass
    @abstractmethod
    def execute_query(self, query: str, parameters: Optional[Dict] = None) -> List[Dict]: pass
    @abstractmethod
    @contextmanager
    def transaction(self): pass
    @abstractmethod
    def add_entity(self, entity_id: str, entity_type: str, properties: Dict, source: str, transaction=None) -> Dict: pass
    @abstractmethod
    def add_relationship(self, source_hints: Dict, target_hints: Dict, rel_type: str, properties: Dict, transaction=None) -> bool: pass
    @abstractmethod
    def add_entities_bulk(self, entities: List[Dict], source: str, transaction=None) -> Dict: pass
    @abstractmethod
    def add_relationships_bulk(self, relationships: List[Dict], transaction=None) -> Dict: pass
    @abstractmethod
    def get_entity(self, entity_id: str) -> Optional[Dict]: pass
    @abstractmethod
    def find_entity_by_hints(self, hints: Dict, transaction=None) -> Optional[str]: pass
    @abstractmethod
    def get_neighbors(self, entity_id: str, relationship_type: Optional[str] = None, direction: str = "both") -> List[Dict]: pass
    @abstractmethod
    def find_path(self, start_node_hints: Dict, end_node_hints: Dict, relationship_types: Optional[List[str]] = None, max_depth: int = 5) -> Optional[List[Dict]]: pass
    @abstractmethod
    def get_schema(self) -> Dict: pass
    @abstractmethod
    def get_stats(self) -> Dict: pass
    # --- Vector Index Methods ---
    @abstractmethod
    def create_vector_index(self, index_name: str, node_label: str, property_name: str, dimensions: int, similarity_fn: str = 'cosine'): pass
    @abstractmethod
    def add_node_embedding(self, node_id: str, embedding: List[float], index_name: str, transaction=None): pass
    @abstractmethod
    def search_by_vector(self, query_vector: List[float], index_name: str, k: int = 5) -> List[Tuple[str, float]]: pass

# --- Neo4j Adapter Implementation ---
class Neo4jAdapter(GraphStorageAdapter):
    """Adapter for interacting with a Neo4j graph database."""
    def __init__(self, config: 'EnhancedConfig'): # Use EnhancedConfig
        self.config = config
        self.uri = config.get("neo4j_uri")
        self.user = config.get("neo4j_user")
        self.password = config.get("neo4j_password")
        self._driver = None
        self.logger = logging.getLogger('MIZ-OKI.Neo4jAdapter')
        if not NEO4J_AVAILABLE:
            raise ImportError("Neo4j library not installed. Cannot use Neo4jAdapter.")
        if not all([self.uri, self.user, self.password]):
             raise ValueError("Neo4j URI, user, or password missing in configuration.")
        self.connect()

    def connect(self):
        """Establish connection to the Neo4j database."""
        if self._driver:
            self.logger.info("Neo4j driver already connected.")
            return
        try:
            self._driver = GraphDatabase.driver(self.uri, auth=basic_auth(self.user, self.password))
            self._driver.verify_connectivity()
            self.logger.info(f"Successfully connected to Neo4j at {self.uri}")
            self._ensure_constraints() # Ensure basic constraints exist
        except neo4j_exceptions.AuthError as auth_e:
             self.logger.error(f"Neo4j authentication failed for user '{self.user}': {auth_e}")
             self._driver = None
             raise ConnectionRefusedError(f"Neo4j authentication failed: {auth_e}") from auth_e
        except neo4j_exceptions.ServiceUnavailable as conn_e:
            self.logger.error(f"Could not connect to Neo4j at {self.uri}: {conn_e}")
            self._driver = None
            raise ConnectionRefusedError(f"Neo4j connection failed: {conn_e}") from conn_e
        except Exception as e:
            self.logger.error(f"Unexpected error connecting to Neo4j: {e}")
            self._driver = None
            raise ConnectionError(f"Unexpected Neo4j connection error: {e}") from e

    def close(self):
        """Close the Neo4j driver connection."""
        if self._driver:
            try:
                self._driver.close()
                self.logger.info("Neo4j connection closed.")
            except Exception as e:
                self.logger.error(f"Error closing Neo4j connection: {e}")
            finally:
                self._driver = None

    def _ensure_constraints(self):
        """Ensure basic unique constraints for faster lookups (e.g., on mizId)."""
        # MIZ 3.0 TODO: Define constraints based on the final KG schema.
        # Example: Ensure unique mizId for entities that should have one.
        constraints_queries = [
            "CREATE CONSTRAINT unique_mizId IF NOT EXISTS FOR (n:Entity) REQUIRE n.mizId IS UNIQUE",
            # Add constraints for specific entity types if needed
            # "CREATE CONSTRAINT unique_customer_id IF NOT EXISTS FOR (c:Customer) REQUIRE c.mizId IS UNIQUE",
        ]
        try:
            with self.transaction() as tx:
                for query in constraints_queries:
                    try:
                        tx.run(query)
                        self.logger.info(f"Applied constraint: {query.split(' REQUIRE')[0]}")
                    except neo4j_exceptions.ClientError as e:
                        # Ignore if constraint already exists (common error code)
                        if "already exists" in str(e).lower():
                            self.logger.debug(f"Constraint likely already exists: {query.split(' REQUIRE')[0]}")
                        else:
                            raise # Re-raise other client errors
        except Exception as e:
            self.logger.warning(f"Failed to ensure constraints (may impact performance): {e}")

    def execute_query(self, query: str, parameters: Optional[Dict] = None) -> List[Dict]:
        """Execute a Cypher query and return results."""
        if not self._driver: raise ConnectionError("Neo4j driver not connected.")
        parameters = parameters or {}
        try:
            # Use read_transaction for read-only queries if identifiable, else write_transaction
            # Simple heuristic: check for CREATE, MERGE, SET, DELETE, REMOVE
            is_write = any(kw in query.upper() for kw in ["CREATE", "MERGE", "SET", "DELETE", "REMOVE", "CALL apoc"])
            fn = self._execute_write if is_write else self._execute_read
            records, _, _ = self._driver.execute_query(query, parameters, database_="neo4j", routing_="w" if is_write else "r")
            # Convert Neo4j records to dictionaries
            return [record.data() for record in records]
        except neo4j_exceptions.ClientError as e:
            self.logger.error(f"Cypher query syntax error: {e}\nQuery: {query}\nParams: {parameters}")
            raise ValueError(f"Cypher query failed: {e}") from e
        except Exception as e:
            self.logger.error(f"Error executing Cypher query: {e}\nQuery: {query}\nParams: {parameters}")
            raise RuntimeError(f"Neo4j query execution failed: {e}") from e

    @contextmanager
    def transaction(self):
        """Provides a transactional context."""
        if not self._driver: raise ConnectionError("Neo4j driver not connected.")
        session = None
        tx = None
        try:
            session = self._driver.session(database_="neo4j")
            tx = session.begin_transaction()
            self.logger.debug("Neo4j transaction started.")
            yield tx # The transaction object is yielded
            tx.commit()
            self.logger.debug("Neo4j transaction committed.")
        except Exception as e:
            self.logger.error(f"Neo4j transaction failed: {e}", exc_info=True)
            if tx and tx.is_open():
                try:
                    tx.rollback()
                    self.logger.warning("Neo4j transaction rolled back.")
                except Exception as rb_e:
                    self.logger.error(f"Error during transaction rollback: {rb_e}")
            raise # Re-raise the original exception
        finally:
            if session:
                session.close()

    def _build_merge_clause(self, hints: Dict, variable: str = 'n') -> Tuple[str, Dict]:
        """Builds MERGE clause and parameters based on hints."""
        params = {}
        merge_parts = []
        node_label = hints.get('type', 'Entity') # Default label

        # Prioritize mizId if available
        if 'mizId' in hints:
             merge_parts.append(f"{variable}.mizId = $mizId_hint")
             params['mizId_hint'] = hints['mizId']
        # Use other unique identifiers if mizId not present
        elif hints.get('platform') and hints.get('original_id'):
             merge_parts.append(f"{variable}.platform = $platform_hint")
             merge_parts.append(f"{variable}.original_id = $original_id_hint")
             params['platform_hint'] = hints['platform']
             params['original_id_hint'] = hints['original_id']
        elif hints.get('email'):
             merge_parts.append(f"{variable}.email = $email_hint")
             params['email_hint'] = hints['email']
        # Add more specific hint combinations as needed

        if not merge_parts:
             raise ValueError(f"Insufficient hints to build MERGE clause for node {variable}: {hints}")

        merge_clause = f"MERGE ({variable}:{node_label} {{ {', '.join(merge_parts)} }})"
        return merge_clause, params

    def find_entity_by_hints(self, hints: Dict, transaction=None) -> Optional[str]:
        """Finds an entity's internal ID based on hints using MATCH."""
        if not hints or not hints.get('type'):
            self.logger.warning("Cannot find entity: 'type' hint is required.")
            return None

        try:
            # Build MATCH clause similar to MERGE, but for finding
            match_parts = []
            params = {}
            node_label = hints['type']

            if 'mizId' in hints:
                 match_parts.append(f"n.mizId = $mizId_hint")
                 params['mizId_hint'] = hints['mizId']
            elif hints.get('platform') and hints.get('original_id'):
                 match_parts.append(f"n.platform = $platform_hint")
                 match_parts.append(f"n.original_id = $original_id_hint")
                 params['platform_hint'] = hints['platform']
                 params['original_id_hint'] = hints['original_id']
            elif hints.get('email'):
                 match_parts.append(f"n.email = $email_hint")
                 params['email_hint'] = hints['email']
            # Add more specific hint combinations

            if not match_parts:
                 self.logger.warning(f"Insufficient hints to build MATCH clause: {hints}")
                 return None

            query = f"MATCH (n:{node_label} {{ {', '.join(match_parts)} }}) RETURN n.mizId AS mizId LIMIT 1"

            def _run_find(tx):
                result = tx.run(query, params)
                record = result.single()
                return record['mizId'] if record else None

            if transaction:
                return _run_find(transaction)
            else:
                with self._driver.session(database_="neo4j") as session:
                    return session.execute_read(_run_find)

        except Exception as e:
            self.logger.error(f"Error finding entity by hints {hints}: {e}")
            return None

    def add_entity(self, entity_dict: Dict, source: str, transaction=None) -> Dict:
        """Adds or updates an entity in Neo4j using MERGE based on hints."""
        if not isinstance(entity_dict, dict): raise TypeError("entity_dict must be a dictionary.")
        hints = entity_dict.get('_resolution_hints')
        if not hints or not hints.get('type'):
            raise ValueError("Entity data must contain '_resolution_hints' with at least a 'type'.")

        entity_type = hints['type']
        # Ensure mizId exists, generate if needed
        mizId = hints.get('mizId') or entity_dict.get('mizId') or f"{entity_type}:{uuid.uuid4()}"
        hints['mizId'] = mizId # Ensure hint has the final mizId

        properties = {k: v for k, v in entity_dict.items() if not k.startswith('_')}
        properties['mizId'] = mizId # Ensure mizId is a property
        properties['entity_type'] = entity_type # Store type explicitly if needed
        properties['source'] = source
        properties['created_at'] = datetime.now().isoformat()
        properties['updated_at'] = properties['created_at']

        # Remove hints from properties to avoid redundancy
        properties.pop('_resolution_hints', None)
        properties.pop('_link_to_entity_hints', None)
        properties.pop('_link_relationship', None)

        try:
            merge_clause, merge_params = self._build_merge_clause(hints, 'n')
            params = {**merge_params, 'props': properties}

            # Use ON CREATE for initial set, ON MATCH for updates
            query = f"""
            {merge_clause}
            ON CREATE SET n = $props, n.created_at = $props.created_at
            ON MATCH SET n += $props, n.updated_at = $props.updated_at
            RETURN n.mizId AS mizId, n.created_at = $props.created_at AS isNew
            """

            def _run_merge(tx):
                result = tx.run(query, params)
                record = result.single()
                if record:
                    return {"success": True, "mizId": record["mizId"], "is_new": record["isNew"]}
                else:
                    # Should not happen with MERGE but handle defensively
                    return {"success": False, "error": "MERGE operation did not return expected result."}

            if transaction:
                return _run_merge(transaction)
            else:
                with self._driver.session(database_="neo4j") as session:
                    return session.execute_write(_run_merge)

        except Exception as e:
            self.logger.error(f"Error adding/updating entity with hints {hints}: {e}", exc_info=True)
            return {"success": False, "error": str(e)}

    def add_relationship(self, rel_dict: Dict, transaction=None) -> bool:
        """Adds or updates a relationship between two entities using MERGE."""
        source_hints = rel_dict.get('source_hints')
        target_hints = rel_dict.get('target_hints')
        rel_type = rel_dict.get('type')
        if not source_hints or not target_hints or not rel_type:
            raise ValueError("Relationship data must contain 'source_hints', 'target_hints', and 'type'.")

        properties = {k: v for k, v in rel_dict.items() if k not in ['source_hints', 'target_hints', 'type']}
        properties['updated_at'] = datetime.now().isoformat()

        try:
            source_merge, source_params = self._build_merge_clause(source_hints, 'a')
            target_merge, target_params = self._build_merge_clause(target_hints, 'b')

            params = {**source_params, **target_params, 'rel_props': properties}
            # Escape relationship type if it contains invalid characters (basic)
            safe_rel_type = "".join(c if c.isalnum() or c == '_' else '_' for c in rel_type)

            # MERGE relationship, update properties on match or create
            query = f"""
            {source_merge}
            {target_merge}
            MERGE (a)-[r:{safe_rel_type}]->(b)
            ON CREATE SET r = $rel_props, r.created_at = $rel_props.updated_at
            ON MATCH SET r += $rel_props, r.updated_at = $rel_props.updated_at
            RETURN count(r) as rel_count
            """

            def _run_rel_merge(tx):
                result = tx.run(query, params)
                record = result.single()
                return record and record["rel_count"] > 0

            if transaction:
                return _run_rel_merge(transaction)
            else:
                with self._driver.session(database_="neo4j") as session:
                    return session.execute_write(_run_rel_merge)

        except Exception as e:
            self.logger.error(f"Error adding/updating relationship {rel_type} between {source_hints} and {target_hints}: {e}", exc_info=True)
            return False

    def add_entities_bulk(self, entities: List[Dict], source: str, transaction=None) -> Dict:
        """Adds/updates entities in bulk using UNWIND and MERGE."""
        if not entities: return {"new": 0, "updated": 0, "failed": 0, "failures": []}

        batch_data = []
        failures = []
        processed_count = 0
        start_time = time.time()

        for entity_dict in entities:
            processed_count += 1
            try:
                hints = entity_dict.get('_resolution_hints')
                if not hints or not hints.get('type'):
                    raise ValueError("Missing '_resolution_hints' or 'type'.")

                entity_type = hints['type']
                mizId = hints.get('mizId') or entity_dict.get('mizId') or f"{entity_type}:{uuid.uuid4()}"
                hints['mizId'] = mizId

                properties = {k: v for k, v in entity_dict.items() if not k.startswith('_')}
                properties['mizId'] = mizId
                properties['entity_type'] = entity_type
                properties['source'] = source
                properties['updated_at'] = datetime.now().isoformat() # Use same timestamp for batch consistency

                # Build hint keys/values for the MERGE clause dynamically
                merge_keys = {}
                if 'mizId' in hints:
                     merge_keys['mizId'] = hints['mizId']
                elif hints.get('platform') and hints.get('original_id'):
                     merge_keys['platform'] = hints['platform']
                     merge_keys['original_id'] = hints['original_id']
                elif hints.get('email'):
                     merge_keys['email'] = hints['email']
                # Add more hint combinations

                if not merge_keys:
                     raise ValueError("Insufficient hints for MERGE.")

                batch_data.append({
                    "merge_keys": merge_keys,
                    "label": entity_type,
                    "props": properties
                })
            except Exception as e:
                failures.append({"data": entity_dict, "error": str(e)})

        if not batch_data:
             duration = time.time() - start_time
             self.logger.error(f"Bulk entity add failed: No valid data in batch after processing {processed_count} items ({duration:.2f}s).")
             return {"new": 0, "updated": 0, "failed": len(failures), "failures": failures}

        # Cypher query using UNWIND
        # Note: Building the MERGE dynamically based on keys in `item.merge_keys` is complex in pure Cypher.
        # A common approach is to MERGE on a primary key (like mizId) if available, or handle different key sets separately.
        # Here, we assume MERGE on mizId for simplicity, which requires mizId to be generated beforehand.
        query = """
        UNWIND $batch AS item
        MERGE (n {mizId: item.merge_keys.mizId})
        ON CREATE SET n = item.props, n.created_at = item.props.updated_at, n:%s // Set label on create
        ON MATCH SET n += item.props, n.updated_at = item.props.updated_at, n:%s // Ensure label exists on match
        WITH n, item.props.updated_at AS updateTime, n.created_at = item.props.updated_at AS isNew
        RETURN sum(CASE WHEN isNew THEN 1 ELSE 0 END) AS newCount,
               sum(CASE WHEN NOT isNew THEN 1 ELSE 0 END) AS updatedCount
        """ % (item.label, item.label) # Inject label - CAUTION: Ensure label is safe

        # MIZ 3.0 TODO: A more robust approach uses apoc.merge.node or handles different merge key sets conditionally.
        # Example using mizId as the primary merge key:
        query_mizid = """
        UNWIND $batch AS item
        MERGE (n {mizId: item.props.mizId})
        ON CREATE SET n = item.props, n.created_at = item.props.updated_at
        ON MATCH SET n += item.props, n.updated_at = item.props.updated_at
        // Dynamically set label (requires APOC or careful handling)
        CALL apoc.create.addLabels(n, [item.label]) YIELD node
        WITH n, item.props.updated_at AS updateTime, n.created_at = item.props.updated_at AS isNew
        RETURN sum(CASE WHEN isNew THEN 1 ELSE 0 END) AS newCount,
               sum(CASE WHEN NOT isNew THEN 1 ELSE 0 END) AS updatedCount
        """
        # Using query_mizid assumes APOC is installed or labels are managed differently.
        # We'll use the simpler label injection for now, assuming labels are safe.

        params = {"batch": batch_data}
        new_count = 0
        updated_count = 0

        try:
            def _run_bulk_entities(tx):
                result = tx.run(query_mizid, params) # Use query_mizid if APOC available
                # result = tx.run(query, params) # Use simpler query otherwise
                record = result.single()
                return record["newCount"] if record else 0, record["updatedCount"] if record else 0

            if transaction:
                new_count, updated_count = _run_bulk_entities(transaction)
            else:
                with self._driver.session(database_="neo4j") as session:
                    new_count, updated_count = session.execute_write(_run_bulk_entities)

            duration = time.time() - start_time
            self.logger.info(f"Bulk entity add completed ({duration:.2f}s). Processed: {processed_count}, Loaded New: {new_count}, Updated: {updated_count}, Failed: {len(failures)}")
            return {"new": new_count, "updated": updated_count, "failed": len(failures), "failures": failures}

        except Exception as e:
            duration = time.time() - start_time
            self.logger.error(f"Bulk entity add failed critically after {duration:.2f}s: {e}", exc_info=True)
            # Add all items in batch_data to failures if the whole query fails
            for item in batch_data:
                 failures.append({"data": item, "error": f"Bulk operation failed: {e}"})
            return {"new": 0, "updated": 0, "failed": len(failures), "failures": failures}

    def add_relationships_bulk(self, relationships: List[Dict], transaction=None) -> Dict:
        """Adds/updates relationships in bulk using UNWIND and MERGE."""
        if not relationships: return {"loaded": 0, "failed": 0, "failures": []}

        batch_data = []
        failures = []
        processed_count = 0
        start_time = time.time()

        for rel_dict in relationships:
            processed_count += 1
            try:
                source_hints = rel_dict.get('source_hints')
                target_hints = rel_dict.get('target_hints')
                rel_type = rel_dict.get('type')
                if not source_hints or not target_hints or not rel_type:
                    raise ValueError("Missing 'source_hints', 'target_hints', or 'type'.")

                properties = {k: v for k, v in rel_dict.items() if k not in ['source_hints', 'target_hints', 'type']}
                properties['updated_at'] = datetime.now().isoformat() # Consistent timestamp

                # Resolve nodes using hints (assuming MERGE logic in query)
                # We need the merge keys for source and target nodes
                def get_merge_keys(hints):
                    keys = {}
                    if 'mizId' in hints: keys['mizId'] = hints['mizId']
                    elif hints.get('platform') and hints.get('original_id'):
                        keys['platform'] = hints['platform']
                        keys['original_id'] = hints['original_id']
                    elif hints.get('email'): keys['email'] = hints['email']
                    if not keys: raise ValueError("Insufficient hints for node merge.")
                    return keys, hints.get('type', 'Entity')

                source_merge_keys, source_label = get_merge_keys(source_hints)
                target_merge_keys, target_label = get_merge_keys(target_hints)
                safe_rel_type = "".join(c if c.isalnum() or c == '_' else '_' for c in rel_type)

                batch_data.append({
                    "source_keys": source_merge_keys, "source_label": source_label,
                    "target_keys": target_merge_keys, "target_label": target_label,
                    "rel_type": safe_rel_type,
                    "props": properties
                })
            except Exception as e:
                failures.append({"data": rel_dict, "error": str(e)})

        if not batch_data:
             duration = time.time() - start_time
             self.logger.error(f"Bulk relationship add failed: No valid data in batch after processing {processed_count} items ({duration:.2f}s).")
             return {"loaded": 0, "failed": len(failures), "failures": failures}

        # Cypher query using UNWIND
        # Again, assumes merging nodes based on mizId primarily for simplicity
        query = """
        UNWIND $batch AS item
        MERGE (a {mizId: item.source_keys.mizId}) // Assumes mizId merge
        MERGE (b {mizId: item.target_keys.mizId}) // Assumes mizId merge
        // Ensure labels exist (optional, depends on node merge strategy)
        // CALL apoc.create.addLabels(a, [item.source_label]) YIELD node AS nodeA
        // CALL apoc.create.addLabels(b, [item.target_label]) YIELD node AS nodeB
        MERGE (a)-[r:`%s`]->(b) // Use variable rel type - CAUTION: Ensure safe_rel_type is safe
        ON CREATE SET r = item.props, r.created_at = item.props.updated_at
        ON MATCH SET r += item.props, r.updated_at = item.props.updated_at
        RETURN count(r) AS loadedCount
        """ % (item.rel_type) # Inject rel type - CAUTION

        # MIZ 3.0 TODO: More robust query handling dynamic merge keys and relationship types safely.
        # Consider separate queries for different node merge strategies or using APOC.

        params = {"batch": batch_data}
        loaded_count = 0

        try:
            def _run_bulk_rels(tx):
                result = tx.run(query, params)
                # Sum counts if query returns multiple rows (though this one shouldn't)
                count = sum(record["loadedCount"] for record in result)
                return count

            if transaction:
                loaded_count = _run_bulk_rels(transaction)
            else:
                with self._driver.session(database_="neo4j") as session:
                    loaded_count = session.execute_write(_run_bulk_rels)

            duration = time.time() - start_time
            # Note: loaded_count from MERGE doesn't distinguish new vs updated, just ensures existence.
            self.logger.info(f"Bulk relationship add completed ({duration:.2f}s). Processed: {processed_count}, Loaded/Updated: {loaded_count}, Failed: {len(failures)}")
            return {"loaded": loaded_count, "failed": len(failures), "failures": failures}

        except Exception as e:
            duration = time.time() - start_time
            self.logger.error(f"Bulk relationship add failed critically after {duration:.2f}s: {e}", exc_info=True)
            for item in batch_data:
                 failures.append({"data": item, "error": f"Bulk operation failed: {e}"})
            return {"loaded": 0, "failed": len(failures), "failures": failures}

    def get_entity(self, mizId: str) -> Optional[Dict]:
        """Retrieve an entity by its mizId."""
        query = "MATCH (n {mizId: $mizId}) RETURN properties(n) AS props"
        params = {"mizId": mizId}
        result = self.execute_query(query, params)
        return result[0]['props'] if result else None

    def get_neighbors(self, mizId: str, relationship_type: Optional[str] = None, direction: str = "both") -> List[Dict]:
        """Get neighbors of a node."""
        if direction == "outgoing": arrow = "-[r]->"
        elif direction == "incoming": arrow = "<-[r]-"
        else: arrow = "-[r]-" # both

        rel_match = f":`{relationship_type}`" if relationship_type else ""

        query = f"""
        MATCH (a {{mizId: $mizId}}){arrow}(b)
        WHERE $rel_type IS NULL OR type(r) = $rel_type
        RETURN b.mizId AS neighborId, properties(b) AS neighborProps, type(r) AS relationshipType, properties(r) AS relationshipProps
        """
        params = {"mizId": mizId, "rel_type": relationship_type}
        return self.execute_query(query, params)

    def find_path(self, start_node_hints: Dict, end_node_hints: Dict, relationship_types: Optional[List[str]] = None, max_depth: int = 5) -> Optional[List[Dict]]:
        """Finds the shortest path between two nodes."""
        try:
            start_merge, start_params = self._build_merge_clause(start_node_hints, 'a')
            end_merge, end_params = self._build_merge_clause(end_node_hints, 'b')

            rel_filter = "*"
            if relationship_types:
                 safe_rels = ["`" + "".join(c if c.isalnum() or c == '_' else '_' for c in rt) + "`" for rt in relationship_types]
                 rel_filter = "|".join(safe_rels)

            query = f"""
            MATCH (a:{start_node_hints.get('type','Entity')}), (b:{end_node_hints.get('type','Entity')})
            WHERE a.mizId = $start_mizId AND b.mizId = $end_mizId // Assuming hints resolved to mizId
            CALL apoc.algo.dijkstra(a, b, '{rel_filter}', 'cost', {max_depth}) YIELD path, weight
            RETURN path, weight
            LIMIT 1
            """
            # MIZ 3.0 TODO: This requires resolving hints to mizIds first, or a more complex MATCH.
            # Simpler version using shortestPath (no weights, requires APOC for variable rel types):
            query_shortest = f"""
            MATCH (a), (b)
            WHERE a.mizId = $start_mizId AND b.mizId = $end_mizId
            MATCH p = shortestPath((a)-[:{rel_filter}*1..{max_depth}]-(b))
            RETURN p as path
            LIMIT 1
            """
            # Resolve hints to mizIds first
            start_mizId = self.find_entity_by_hints(start_node_hints)
            end_mizId = self.find_entity_by_hints(end_node_hints)
            if not start_mizId or not end_mizId:
                 self.logger.warning("Could not resolve start or end node for pathfinding.")
                 return None

            params = {"start_mizId": start_mizId, "end_mizId": end_mizId}
            result = self.execute_query(query_shortest, params) # Use shortestPath query

            if result and 'path' in result[0]:
                 # Process the path object (convert nodes/rels to dicts)
                 path_data = []
                 neo4j_path = result[0]['path']
                 # Check path type (might be list of nodes/rels or Path object)
                 if hasattr(neo4j_path, 'nodes') and hasattr(neo4j_path, 'relationships'):
                      nodes = [n._properties for n in neo4j_path.nodes]
                      rels = [r._properties for r in neo4j_path.relationships]
                      # Interleave nodes and relationships (approximate)
                      for i, node in enumerate(nodes):
                           path_data.append({"type": "node", **node})
                           if i < len(rels):
                                path_data.append({"type": "relationship", **rels[i]})
                      return path_data
                 else:
                      self.logger.warning(f"Pathfinding returned unexpected path format: {type(neo4j_path)}")
                      return None # Or try to parse differently
            return None
        except Exception as e:
            self.logger.error(f"Error finding path: {e}")
            return None

    def get_schema(self) -> Dict:
        """Get graph schema (labels, relationship types, properties)."""
        try:
            # Using APOC for detailed schema is recommended
            # query = "CALL apoc.meta.schema() YIELD value RETURN value"
            # Fallback using built-ins
            labels_query = "CALL db.labels() YIELD label RETURN collect(label) AS labels"
            rels_query = "CALL db.relationshipTypes() YIELD relationshipType RETURN collect(relationshipType) AS relationshipTypes"

            labels = self.execute_query(labels_query)[0]['labels']
            rel_types = self.execute_query(rels_query)[0]['relationshipTypes']

            # Get properties (sample-based, less reliable than APOC)
            properties = {}
            # Sample properties for first 10 labels/rels
            # for label in labels[:10]:
            #     prop_query = f"MATCH (n:`{label}`) WITH n LIMIT 1 UNWIND keys(n) AS key RETURN collect(distinct key) AS props"
            #     props = self.execute_query(prop_query)
            #     if props: properties[label] = props[0]['props']
            # Similar logic for relationship types

            return {"labels": labels, "relationship_types": rel_types, "properties": "Use APOC for detailed properties"}
        except Exception as e:
            self.logger.error(f"Error getting schema: {e}")
            return {"error": str(e)}

    def get_stats(self) -> Dict:
        """Get basic graph statistics."""
        try:
            nodes_query = "MATCH (n) RETURN count(n) AS nodeCount"
            rels_query = "MATCH ()-[r]->() RETURN count(r) AS relationshipCount"
            node_count = self.execute_query(nodes_query)[0]['nodeCount']
            rel_count = self.execute_query(rels_query)[0]['relationshipCount']
            return {"nodes": node_count, "edges": rel_count}
        except Exception as e:
            self.logger.error(f"Error getting stats: {e}")
            return {"nodes": -1, "edges": -1, "error": str(e)}

    # --- Vector Index Methods (Placeholders) ---
    def create_vector_index(self, index_name: str, node_label: str, property_name: str, dimensions: int, similarity_fn: str = 'cosine'):
        """Creates a vector index in Neo4j."""
        # MIZ 3.0 TODO: Implement using `CALL db.index.vector.createNodeIndex(...)`
        self.logger.warning(f"Vector index creation for '{index_name}' not fully implemented (Placeholder).")
        query = f"""
        CALL db.index.vector.createNodeIndex(
            '{index_name}',     // Index name
            '{node_label}',     // Node label
            '{property_name}',  // Node property storing embeddings
            {dimensions},       // Embedding dimension
            '{similarity_fn}'   // Similarity function
        )
        """
        try:
            self.execute_query(query)
            self.logger.info(f"Attempted to create vector index '{index_name}'.")
        except Exception as e:
            # Ignore if index already exists, log other errors
            if "already exists" in str(e).lower():
                 self.logger.info(f"Vector index '{index_name}' likely already exists.")
            else:
                 self.logger.error(f"Failed to create vector index '{index_name}': {e}")
                 # Don't raise, allow continuation

    def add_node_embedding(self, mizId: str, embedding: List[float], index_name: str, transaction=None):
        """Adds or updates a node's embedding property for indexing."""
        # MIZ 3.0 TODO: Ensure property name matches index definition.
        # This method just sets the property; actual indexing might happen via `db.index.vector.addNode` or automatically.
        # For simplicity, we just set the property here.
        self.logger.warning(f"add_node_embedding only sets property (Placeholder). Ensure '{index_name}' uses this property.")
        query = """
        MATCH (n {mizId: $mizId})
        SET n.embedding = $embedding, n.updated_at = datetime()
        """
        params = {"mizId": mizId, "embedding": embedding}
        try:
            def _run_set_embedding(tx):
                tx.run(query, params)

            if transaction:
                _run_set_embedding(transaction)
            else:
                with self._driver.session(database_="neo4j") as session:
                    session.execute_write(_run_set_embedding)
            return True
        except Exception as e:
            self.logger.error(f"Failed to set embedding for node {mizId}: {e}")
            return False

    def search_by_vector(self, query_vector: List[float], index_name: str, k: int = 5) -> List[Tuple[str, float]]:
        """Searches the vector index."""
        # MIZ 3.0 TODO: Implement using `CALL db.index.vector.queryNodes(...)`
        self.logger.warning(f"Vector search for index '{index_name}' not fully implemented (Placeholder).")
        query = f"""
        CALL db.index.vector.queryNodes('{index_name}', $k, $queryVector) YIELD node, score
        RETURN node.mizId AS mizId, score
        """
        params = {"k": k, "queryVector": query_vector}
        try:
            results = self.execute_query(query, params)
            return [(r['mizId'], r['score']) for r in results]
        except Exception as e:
            self.logger.error(f"Vector search failed for index '{index_name}': {e}")
            return []

# --- Enhanced Self-Healing Knowledge Graph (Uses Adapter) ---
class EnhancedSelfHealingKG:
    """
    Manages the knowledge graph lifecycle, using a GraphStorageAdapter.
    Removes direct NetworkX dependency for core storage.
    """
    def __init__(self, config: 'EnhancedConfig', adapter: GraphStorageAdapter): # Inject adapter
        self.config = config
        self.adapter = adapter # Use the injected adapter
        self.logger = logging.getLogger('MIZ-OKI.EnhancedSelfHealingKG')

        # Indexer might still be useful for quick lookups not covered by DB indexes, or can be phased out.
        self.indexer = SimpleIndexer()
        self.anomaly_detector = PredictiveAnomalyDetector(self) # Pass self (KG instance)
        self.pseudonymizer = DataPseudonymizer(config.get("pseudonymization_salt", "default_salt"))

        self.entity_cache = {} # Simple in-memory cache (MIZ 3.0 TODO: Use proper cache like Redis)
        self.cache_ttl = timedelta(minutes=config.get("kg_cache_ttl_minutes", 5))
        self.cache_last_cleared = datetime.now()

        self.logger.info(f"EnhancedSelfHealingKG initialized with adapter: {adapter.__class__.__name__}")

    def _clear_cache_if_needed(self):
        if datetime.now() - self.cache_last_cleared > self.cache_ttl:
            self.entity_cache.clear()
            self.cache_last_cleared = datetime.now()
            self.logger.info("KG entity cache cleared.")

    def add_entity(self, entity_dict: Dict, source: str, transaction=None) -> Dict:
        """Adds/updates an entity using the adapter, updates indexer."""
        self._clear_cache_if_needed()
        # Pseudonymize sensitive data before adding
        entity_dict_processed = self.pseudonymizer.pseudonymize_dict(entity_dict)
        hints = entity_dict_processed.get('_resolution_hints')

        # Add/Update via adapter
        result = self.adapter.add_entity(entity_dict_processed, source, transaction=transaction)

        if result.get("success"):
            mizId = result["mizId"]
            entity_type = hints.get('type')
            # Update indexer (consider if still needed with DB indexes)
            self.indexer.index_entity(mizId, entity_type, entity_dict_processed)
            # Invalidate cache for this entity
            self.entity_cache.pop(mizId, None)
        return result

    def add_relationship(self, rel_dict: Dict, transaction=None) -> bool:
        """Adds/updates a relationship using the adapter."""
        # Pseudonymize properties if needed
        rel_dict_processed = self.pseudonymizer.pseudonymize_dict(rel_dict)
        return self.adapter.add_relationship(rel_dict_processed, transaction=transaction)

    def add_entities_bulk(self, entities: List[Dict], source: str, transaction=None) -> Dict:
        """Adds/updates entities in bulk using the adapter."""
        self._clear_cache_if_needed()
        processed_entities = [self.pseudonymizer.pseudonymize_dict(e) for e in entities]
        results = self.adapter.add_entities_bulk(processed_entities, source, transaction=transaction)
        # MIZ 3.0 TODO: Update indexer in bulk? Or rely on DB indexes primarily.
        # Invalidate relevant parts of the cache if bulk update is significant.
        return results

    def add_relationships_bulk(self, relationships: List[Dict], transaction=None) -> Dict:
        """Adds/updates relationships in bulk using the adapter."""
        processed_rels = [self.pseudonymizer.pseudonymize_dict(r) for r in relationships]
        return self.adapter.add_relationships_bulk(processed_rels, transaction=transaction)

    def get_entity(self, mizId: str, use_cache=True) -> Optional[Dict]:
        """Retrieves an entity by mizId using the adapter, with caching."""
        self._clear_cache_if_needed()
        if use_cache and mizId in self.entity_cache:
            self.logger.debug(f"Cache hit for entity {mizId}")
            return self.entity_cache[mizId]

        entity_props = self.adapter.get_entity(mizId)
        if entity_props:
             # Depseudonymize if needed (assuming get_entity returns raw props)
             entity_depseudonymized = self.pseudonymizer.depseudonymize_dict(entity_props)
             if use_cache:
                  self.entity_cache[mizId] = entity_depseudonymized
             return entity_depseudonymized
        return None

    def find_entity_by_hints(self, hints: Dict, transaction=None) -> Optional[str]:
        """Finds an entity's mizId by hints using the adapter."""
        return self.adapter.find_entity_by_hints(hints, transaction=transaction)

    def get_neighbors(self, mizId: str, relationship_type: Optional[str] = None, direction: str = "both") -> List[Dict]:
        """Gets neighbors using the adapter."""
        return self.adapter.get_neighbors(mizId, relationship_type, direction)

    def find_path(self, start_node_hints: Dict, end_node_hints: Dict, relationship_types: Optional[List[str]] = None, max_depth: int = 5) -> Optional[List[Dict]]:
         """Finds path using the adapter."""
         return self.adapter.find_path(start_node_hints, end_node_hints, relationship_types, max_depth)

    def get_schema(self) -> Dict:
        """Gets schema using the adapter."""
        return self.adapter.get_schema()

    def get_stats(self) -> Dict:
        """Gets stats using the adapter."""
        return self.adapter.get_stats()

    @contextmanager
    def transaction(self):
        """Provides a transactional context via the adapter."""
        with self.adapter.transaction() as tx:
            yield tx # Yield the adapter's transaction object

    # --- Vector/Embedding Methods ---
    def set_entity_embedding(self, mizId: str, embedding: List[float], index_name: str, transaction=None):
        """Sets the embedding property for an entity via the adapter."""
        # MIZ 3.0 TODO: Ensure index_name corresponds to a configured index.
        return self.adapter.add_node_embedding(mizId, embedding, index_name, transaction=transaction)

    def find_similar_entities(self, query_vector: List[float], index_name: str, k: int = 5) -> List[Tuple[str, float]]:
        """Finds similar entities using vector search via the adapter."""
        return self.adapter.search_by_vector(query_vector, index_name, k)

    # --- Self-Healing ---
    def detect_and_heal_anomalies(self, limit_per_type=100) -> int:
        """Detects and attempts to heal anomalies using the AnomalyDetector."""
        self.logger.info("Starting anomaly detection and healing cycle...")
        start_time = time.time()
        fixed_count = self.anomaly_detector.run_detection_and_healing(limit=limit_per_type)
        duration = time.time() - start_time
        self.logger.info(f"Anomaly detection and healing cycle finished in {duration:.2f}s. Issues potentially fixed: {fixed_count}")
        return fixed_count

    def close(self):
        """Close the adapter connection."""
        self.adapter.close()

# --- Predictive Anomaly Detector (Refactored for Cypher) ---
class PredictiveAnomalyDetector:
    """Detects anomalies in the KG using Cypher queries via the adapter."""
    def __init__(self, kg_instance: EnhancedSelfHealingKG):
        self.kg = kg_instance # Instance of EnhancedSelfHealingKG
        self.logger = logging.getLogger('MIZ-OKI.AnomalyDetector')

    def run_detection_and_healing(self, limit=100) -> int:
        """Runs all detection checks and attempts healing."""
        total_fixed = 0
        anomaly_checks = [
            self._check_orphaned_entities,
            self._check_missing_core_attributes,
            # Add more checks: _check_temporal_inconsistencies, _check_schema_violations, etc.
        ]
        for check_func in anomaly_checks:
            try:
                anomalies = check_func(limit)
                if anomalies:
                    self.logger.warning(f"Detected {len(anomalies)} anomalies via {check_func.__name__}.")
                    fixed = self._attempt_healing(anomalies, check_func.__name__)
                    total_fixed += fixed
            except Exception as e:
                self.logger.error(f"Error during anomaly check {check_func.__name__}: {e}")
        return total_fixed

    def _execute_detection_query(self, query: str, params: Dict, description: str) -> List[Dict]:
        """Helper to execute a detection query via the KG adapter."""
        self.logger.debug(f"Running detection query: {description}")
        try:
            return self.kg.adapter.execute_query(query, params)
        except Exception as e:
            self.logger.error(f"Failed to execute detection query '{description}': {e}")
            return []

    def _check_orphaned_entities(self, limit=100) -> List[Dict]:
        """Finds nodes with no relationships."""
        query = """
        MATCH (n)
        WHERE NOT (n)--() AND NOT n:Metadata AND NOT n:Schema // Exclude helper nodes
        RETURN n.mizId AS mizId, labels(n) AS types
        LIMIT $limit
        """
        params = {"limit": limit}
        return self._execute_detection_query(query, params, "Orphaned Entities")

    def _check_missing_core_attributes(self, limit=100) -> List[Dict]:
        """Finds entities missing essential attributes (example: 'name' for Campaign)."""
        # MIZ 3.0 TODO: Define core attributes per type in config or schema
        query = """
        MATCH (c:Campaign)
        WHERE c.name IS NULL OR c.name = ''
        RETURN c.mizId AS mizId, 'Campaign' AS entityType, 'name' AS missingAttribute
        LIMIT $limit
        """
        params = {"limit": limit}
        results = self._execute_detection_query(query, params, "Missing Core Attributes (Campaign.name)")
        # Add more checks for other types...
        return results

    # MIZ 3.0 TODO: Implement temporal checks, schema violation checks, etc.

    def _attempt_healing(self, anomalies: List[Dict], check_type: str) -> int:
        """Attempts to heal detected anomalies (basic placeholder)."""
        fixed_count = 0
        for anomaly in anomalies:
            mizId = anomaly.get('mizId')
            if not mizId: continue
            self.logger.info(f"Attempting to heal anomaly ({check_type}) for entity {mizId}...")
            healed = False
            try:
                # --- Healing Logic Placeholder ---
                if check_type == "_check_orphaned_entities":
                    # Strategy: Delete orphan if old and low importance, else flag for review.
                    entity = self.kg.get_entity(mizId)
                    if entity:
                         created_at_str = entity.get('created_at')
                         # Example: Delete if older than 30 days and source is 'temp'
                         # if created_at_str and (datetime.now() - datetime.fromisoformat(created_at_str)).days > 30 and entity.get('source') == 'temp':
                         #     self.kg.adapter.execute_query("MATCH (n {mizId: $mizId}) DETACH DELETE n", {"mizId": mizId})
                         #     healed = True
                         # else:
                         self.logger.warning(f"Orphaned entity {mizId} requires review.")
                         # MIZ 3.0 TODO: Flag for review via Orchestrator/Human Interface
                elif check_type == "_check_missing_core_attributes":
                    # Strategy: Try to infer missing attribute or flag for review.
                    entity_type = anomaly.get('entityType')
                    missing_attr = anomaly.get('missingAttribute')
                    # Example: Infer campaign name from related AdGroup if possible
                    # neighbors = self.kg.get_neighbors(mizId, relationship_type='CONTAINS', direction='outgoing')
                    # inferred_value = ...
                    # if inferred_value:
                    #     self.kg.adapter.execute_query("MATCH (n {mizId: $mizId}) SET n.%s = $value" % missing_attr, {"mizId": mizId, "value": inferred_value})
                    #     healed = True
                    # else:
                    self.logger.warning(f"Entity {mizId} ({entity_type}) missing core attribute '{missing_attr}'. Requires review.")
                    # MIZ 3.0 TODO: Flag for review

                # --- End Healing Logic ---
                if healed:
                    fixed_count += 1
                    self.logger.info(f"Successfully applied healing action for {mizId}.")
                else:
                    self.logger.debug(f"No automatic healing action taken for {mizId}.")

            except Exception as e:
                self.logger.error(f"Error attempting to heal anomaly for {mizId}: {e}")

        return fixed_count

# --- Simple Indexer (May become less critical with DB indexes) ---
class SimpleIndexer:
    """Basic in-memory index for quick lookups (consider replacing with DB indexes)."""
    def __init__(self):
        self.type_index = defaultdict(set)
        self.attribute_index = defaultdict(lambda: defaultdict(set)) # attr -> value -> {id1, id2}
        self.logger = logging.getLogger('MIZ-OKI.SimpleIndexer')

    def index_entity(self, entity_id, entity_type, properties):
        if not entity_id or not entity_type: return
        self.type_index[entity_type].add(entity_id)
        for key, value in properties.items():
            # Index simple, hashable values
            if isinstance(value, (str, int, float, bool)) and value is not None:
                try:
                     # Limit indexed value length for strings
                     value_to_index = str(value)[:256] if isinstance(value, str) else value
                     self.attribute_index[key][value_to_index].add(entity_id)
                except TypeError: # Handle unhashable types gracefully
                     self.logger.debug(f"Could not index attribute '{key}' with value type {type(value)}")


    def remove_entity(self, entity_id, entity_type, properties):
        # MIZ 3.0 TODO: Implement removal logic if using this indexer actively.
        pass

    def search_by_type(self, entity_type):
        return list(self.type_index.get(entity_type, set()))

    def search_by_attribute(self, attribute_key, attribute_value):
         # Limit indexed value length for strings
         value_to_search = str(attribute_value)[:256] if isinstance(attribute_value, str) else attribute_value
         return list(self.attribute_index.get(attribute_key, {}).get(value_to_search, set()))

# --- Data Pseudonymizer ---
class DataPseudonymizer:
    """Handles pseudonymization of sensitive data using a salt."""
    def __init__(self, salt: str):
        if not salt or salt == "default_salt" or salt == "default_insecure_salt_replace_me_!!":
            logger.critical("CRITICAL SECURITY RISK: Using default or insecure salt for pseudonymization. SET MIZ_SALT ENV VAR.")
            # In production, raise an error or exit. For notebook, proceed with warning.
            # raise ValueError("Insecure salt detected. Set MIZ_SALT environment variable securely.")
        self.salt = salt.encode('utf-8')
        # MIZ 3.0 TODO: Define which fields are sensitive in config
        self.sensitive_fields = {"email", "phone", "ip_address", "customer_name", "user_id"} # Example set

    def _hash(self, value: str) -> str:
        """Generates a salted hash of the value."""
        return hashlib.sha256(self.salt + str(value).encode('utf-8')).hexdigest()

    def pseudonymize_value(self, key: str, value: Any) -> Any:
        """Pseudonymizes a single value if the key is sensitive."""
        if key in self.sensitive_fields and isinstance(value, str) and value:
            return f"pseudo_{self._hash(value)[:16]}" # Prefix and truncate hash
        return value

    def pseudonymize_dict(self, data: Dict) -> Dict:
        """Recursively pseudonymizes sensitive fields in a dictionary."""
        if not isinstance(data, dict): return data
        pseudonymized = {}
        for key, value in data.items():
            if isinstance(value, dict):
                pseudonymized[key] = self.pseudonymize_dict(value)
            elif isinstance(value, list):
                pseudonymized[key] = [self.pseudonymize_dict(item) if isinstance(item, dict) else self.pseudonymize_value(key, item) for item in value]
            else:
                pseudonymized[key] = self.pseudonymize_value(key, value)
        return pseudonymized

    def depseudonymize_dict(self, data: Dict) -> Dict:
        """Placeholder for depseudonymization (requires secure storage of original values - NOT IMPLEMENTED)."""
        # WARNING: True depseudonymization requires storing the original values securely,
        # mapped to their pseudonymized versions. This is complex and has security implications.
        # This function currently does NOT reverse the pseudonymization.
        # self.logger.warning("Depseudonymization called, but it's a placeholder and does not reverse hashing.")
        return data # Return data as is

# --- Initialization ---
eshkg = None
if 'config' in locals() and config: # Check if EnhancedConfig instance exists
    if config.get("kg_storage_type") == "neo4j":
        if NEO4J_AVAILABLE:
            try:
                neo4j_adapter = Neo4jAdapter(config)
                eshkg = EnhancedSelfHealingKG(config, neo4j_adapter)
                print("--- MIZ 3.0 Knowledge Graph Layer Initialized (Neo4j Backend) ---")
                # Optional: Create vector index on initialization
                # try:
                #     eshkg.adapter.create_vector_index("entity_embeddings", "Entity", "embedding", 768) # Example dimension
                # except Exception as index_e:
                #     logger.error(f"Failed to create initial vector index: {index_e}")
            except (ImportError, ValueError, ConnectionRefusedError, ConnectionError) as e:
                print(f"--- MIZ 3.0 KG Initialization FAILED (Neo4j): {e} ---")
                logger.critical(f"Neo4j KG Initialization failed: {e}", exc_info=True)
                # Fallback or halt execution
        else:
            print("--- MIZ 3.0 KG Initialization FAILED: Neo4j configured but library not installed. ---")
            logger.critical("Neo4j library not installed.")
    else:
        # MIZ 3.0 TODO: Implement other adapters (e.g., MemoryGraphAdapter for testing)
        print(f"--- MIZ 3.0 KG Initialization SKIPPED: Unsupported kg_storage_type '{config.get('kg_storage_type')}'. ---")
        logger.warning(f"Unsupported kg_storage_type: {config.get('kg_storage_type')}")
else:
    print("--- MIZ 3.0 KG Initialization SKIPPED: Configuration not available. ---")
    logger.error("EnhancedConfig 'config' instance not found. Cannot initialize KG.")

# Example Usage (requires eshkg to be initialized)
# if eshkg:
#     print("\nTesting KG Operations...")
#     # Add entity
#     add_result = eshkg.add_entity({
#         "_resolution_hints": {"type": "Customer", "email": "test@example.com"},
#         "name": "Test User",
#         "email": "test@example.com", # Will be pseudonymized
#         "status": "active"
#     }, source="test_script")
#     print(f"Add Entity Result: {add_result}")
#
#     if add_result.get("success"):
#         customer_id = add_result["mizId"]
#         # Get entity
#         retrieved_customer = eshkg.get_entity(customer_id)
#         print(f"Retrieved Customer: {retrieved_customer}") # Email should be pseudonymized
#
#         # Add another entity and relationship
#         order_result = eshkg.add_entity({
#             "_resolution_hints": {"type": "Order", "platform": "shopify", "original_id": "ORD123"},
#             "platform": "shopify", "original_id": "ORD123", "total": 99.99
#         }, source="test_script")
#         if order_result.get("success"):
#             order_id = order_result["mizId"]
#             rel_success = eshkg.add_relationship({
#                 "source_hints": {"type": "Customer", "mizId": customer_id},
#                 "target_hints": {"type": "Order", "mizId": order_id},
#                 "type": "PLACED_ORDER",
#                 "order_date": datetime.now().date().isoformat()
#             })
#             print(f"Add Relationship Result: {rel_success}")
#
#             # Get neighbors
#             neighbors = eshkg.get_neighbors(customer_id)
#             print(f"Neighbors of {customer_id}: {neighbors}")
#
#     # Get Stats
#     stats = eshkg.get_stats()
#     print(f"Graph Stats: {stats}")
#
#     # Run anomaly detection
#     fixed_anomalies = eshkg.detect_and_heal_anomalies()
#     print(f"Anomalies Fixed: {fixed_anomalies}")
#
#     # Remember to close connection when done
#     # eshkg.close()


ERROR:MIZ-OKI.KnowledgeGraph:EnhancedConfig 'config' instance not found. Cannot initialize KG.


CRITICAL: 'neo4j' library not found. Install (`pip install neo4j`) for KG functionality.
--- MIZ 3.0 KG Initialization SKIPPED: Configuration not available. ---


In [35]:
# Cell 4: Foundational Layer Implementation (MIZ 3.0 OKI - Reworked for B.O.S.S. & MoA)
# Status: AKA enhanced with B.O.S.S. loop structure. MoEManager uses Vertex endpoints. DataFlow integrates MoA Comms. AKA inherits EnhancedBaseAgent.

import os
import numpy as np
import pandas as pd
import tensorflow as tf
# from tensorflow.keras import layers, models, optimizers
import datetime
import json
import logging
import time
import random
import requests
from google.cloud import aiplatform
from google.cloud import exceptions as gcp_exceptions
import asyncio # Added for async agent
from typing import Dict, Any, Optional, List, Union # Added Union

# --- MoA/Orchestrator Dependency ---
# Import the new MoA system components from Cell 15
# Assuming Cell 15 defines: EnhancedBaseAgent, UnifiedCommunicationSystem, AgentMessage, MessageType
# If running standalone, use placeholders:
try:
    from cell15 import EnhancedBaseAgent, UnifiedCommunicationSystem, AgentMessage, MessageType, MIZ_MoA_System
except ImportError:
    logging.warning("Could not import MoA components from Cell 15. Using placeholders.")
    class EnhancedBaseAgent: # Basic Placeholder
        def __init__(self, agent_id, config, communication_system, knowledge_graph, capabilities=None):
            self.agent_id = agent_id; self.communication = communication_system; self.logger = logging.getLogger(agent_id)
        async def initialize(self): pass
        async def run(self): pass
        async def cleanup(self): pass
        async def send_message(self, *args, **kwargs): self.logger.debug("Placeholder send_message")
    class UnifiedCommunicationSystem: pass
    class AgentMessage: pass
    class MessageType: TASK_ASSIGNMENT = 1; TASK_RESULT = 2 # Dummy values
    class MIZ_MoA_System: pass # Placeholder

# --- Other Dependencies ---
# Assume these are available or use placeholders
# from cell1 import EnhancedConfig, CONFIG
# from cell3 import EnhancedSelfHealingKG
# from cell5 import KnowledgeUpdate # Used by AKA
# from cell7 import PrivacyControls # Used by DataFlowManager
# from cell8 import KnowledgeDistillation, DistributedReinforcementLearning # Used by AKA
# from cell18 import FoundationModelClient

# --- Placeholder Dependencies ---
class PlaceholderKG: pass
class PlaceholderKU: pass
class PlaceholderPC: pass
class PlaceholderKD: pass
class PlaceholderFMClient: pass
class PlaceholderMoEManager: # Updated Placeholder
    expert_registry = {}
    def register_expert(self, expert_id, **kwargs): self.expert_registry[expert_id] = kwargs; logging.info(f"Mock MoE: Registered {expert_id}")
    def invoke_expert(self, expert_id, *args, **kwargs): logging.debug(f"Mock MoE: Invoked {expert_id}"); return {"prediction": [random.random()]}
# --- End Placeholders ---

logger = logging.getLogger('MIZ-OKI.FoundationalLayer')

# --- Mixture of Experts Manager (Remains largely the same, uses Vertex AI) ---
class MixtureOfExpertsManager:
    """
    Manages a dynamic collection of expert models (MIZ 3.0 MoE Pillar).
    Handles registration, routing (MVP: basic), invocation via Vertex AI.
    """
    def __init__(self, config: Dict): # Use Dict for broader compatibility if EnhancedConfig not available
        self.config = config
        self.project_id = config.get("project_id", "dummy-project") # Get from dict
        self.region = config.get("region", "us-central1") # Get from dict
        self.expert_registry = {}
        self.routing_model = None
        self.logger = logging.getLogger('MIZ-OKI.MoEManager')
        self.logger.info("Mixture of Experts Manager initialized.")
        # MIZ 3.0 TODO: Load initial registry from persistent storage

    def register_expert(self, expert_id, task_type, domain, model_resource_name=None, endpoint_name=None, description="", prediction_params=None):
        """Registers or updates an expert model deployed on Vertex AI."""
        if not endpoint_name:
             self.logger.error(f"Cannot register expert {expert_id}: Vertex AI endpoint_name is required.")
             return False

        if endpoint_name and not endpoint_name.startswith("projects/"):
             endpoint_name = f"projects/{self.project_id}/locations/{self.region}/endpoints/{endpoint_name}"
        if model_resource_name and not model_resource_name.startswith("projects/"):
             model_resource_name = f"projects/{self.project_id}/locations/{self.region}/models/{model_resource_name}"

        prediction_endpoint_uri = None
        if endpoint_name:
             try:
                  api_endpoint = f"{self.region}-aiplatform.googleapis.com"
                  prediction_endpoint_uri = f"https://{api_endpoint}/v1/{endpoint_name}:predict"
             except Exception as e:
                  self.logger.warning(f"Could not derive prediction URI for endpoint {endpoint_name}: {e}")

        self.expert_registry[expert_id] = {
            "task_type": task_type, "domain": domain, "model_resource_name": model_resource_name,
            "endpoint_name": endpoint_name, "prediction_endpoint_uri": prediction_endpoint_uri,
            "description": description, "status": "active", "registered_at": datetime.datetime.now().isoformat(),
            "prediction_params": prediction_params or {}
        }
        self.logger.info(f"Expert '{expert_id}' registered/updated. Endpoint: {endpoint_name}")
        return True

    def invoke_expert(self, expert_id, input_data):
        """Invokes expert via Vertex AI Endpoint."""
        if expert_id not in self.expert_registry:
            self.logger.error(f"Expert '{expert_id}' not found in registry.")
            return None
        expert_meta = self.expert_registry[expert_id]
        endpoint_resource_name = expert_meta.get("endpoint_name")
        prediction_params = expert_meta.get("prediction_params")

        if not endpoint_resource_name:
            self.logger.error(f"No Vertex AI endpoint configured for expert '{expert_id}'.")
            return None

        try:
            # Ensure aiplatform is initialized (should be done globally or passed)
            if 'aiplatform' not in globals() or not hasattr(aiplatform, 'Endpoint'):
                 raise RuntimeError("Vertex AI SDK (aiplatform) not initialized or available.")

            endpoint = aiplatform.Endpoint(endpoint_name=endpoint_resource_name)

            if isinstance(input_data, dict) and 'instances' in input_data: instances = input_data['instances']
            elif isinstance(input_data, np.ndarray): instances = input_data.tolist()
            elif isinstance(input_data, pd.DataFrame): instances = input_data.to_dict(orient='records')
            elif isinstance(input_data, list): instances = input_data
            elif isinstance(input_data, dict): instances = [input_data]
            else: self.logger.error(f"Unsupported input data type: {type(input_data)}"); return None
            if not instances: self.logger.warning("No instances provided."); return []

            self.logger.info(f"Invoking expert {expert_id} via endpoint {endpoint_resource_name}...")
            prediction = endpoint.predict(instances=instances, parameters=prediction_params)
            self.logger.info(f"Received prediction from expert {expert_id}.")
            return getattr(prediction, 'predictions', getattr(prediction, '_pb', prediction))

        except gcp_exceptions.NotFound: self.logger.error(f"Vertex AI Endpoint not found: {endpoint_resource_name}"); return None
        except Exception as e: self.logger.error(f"Error invoking expert {expert_id}: {e}", exc_info=True); return None

    # Other MoEManager methods (route_request, trigger_expert_update, etc.) remain similar

# --- Data Flow Manager (Refactored for MoA Comms) ---
class DataFlowManager:
    """
    Manages secure and intelligent data flows between domains/agents.
    Integrates with MoA Communication System instead of direct dispatch.
    """
    def __init__(self, knowledge_graph: Any, privacy_controls: Any, communication_system: Optional[UnifiedCommunicationSystem]): # Use MoA Comms
        self.kg = knowledge_graph
        self.privacy_controls = privacy_controls
        self.communication = communication_system # Store communication system
        self.domains = {}
        self.flows = {}
        self.sync_logs = deque(maxlen=1000)
        self.logger = logging.getLogger('MIZ-OKI.DataFlowManager')
        if not self.communication:
             self.logger.warning("CommunicationSystem not provided. Cannot trigger agent tasks.")
        self.logger.info("Data Flow Manager initialized (MoA Integrated).")

    def register_domain(self, domain_id, domain_name, data_sources=None, privacy_profile="default"):
        self.domains[domain_id] = {"name": domain_name, "data_sources": data_sources or [], "privacy_profile": privacy_profile}
        self.logger.info(f"Registered domain: {domain_id}")

    def define_data_flow(self, flow_id, source_domain, target_domain, data_type,
                         trigger_condition=None, transformation_logic=None,
                         target_agent_id=None, target_agent_task=None): # Keep target agent info
        if source_domain not in self.domains or target_domain not in self.domains:
            raise ValueError("Source or target domain not registered.")
        self.flows[flow_id] = {
            "source_domain": source_domain, "target_domain": target_domain, "data_type": data_type,
            "trigger_condition": trigger_condition, "transformation_logic": transformation_logic,
            "target_agent_id": target_agent_id, "target_agent_task": target_agent_task, # Store target info
            "defined_at": datetime.datetime.now().isoformat()
        }
        self.logger.info(f"Defined data flow: {flow_id}")

    async def execute_flow(self, flow_id, data=None, context=None):
        """Executes a data flow, triggering agents via the Communication System."""
        if flow_id not in self.flows: raise ValueError(f"Flow ID '{flow_id}' not found.")
        flow = self.flows[flow_id]
        context = context or {}; context['flow_id'] = flow_id
        log_entry = {"flow_id": flow_id, "timestamp": datetime.datetime.now().isoformat(), "status": "started", "context": context}

        try:
            # Steps 1-4: Trigger, Fetch, Privacy, Transform (remain conceptually similar, use async if needed)
            # ... (Assume data fetching and processing logic here) ...
            fetched_data = data or [{"id": 1, "value": "sample"}] # Placeholder fetch
            processed_data = fetched_data # Placeholder processing

            # Step 5: Trigger Agent via Communication System
            final_data = processed_data
            log_entry["final_data_count"] = len(final_data)
            result_payload = {}

            target_agent_id = flow.get("target_agent_id")
            target_task_type = flow.get("target_agent_task")

            if target_agent_id and target_task_type and self.communication:
                self.logger.info(f"Flow {flow_id}: Sending task '{target_task_type}' to agent '{target_agent_id}' via Communication System.")
                message_content = {"task_type": target_task_type, "input_data": final_data, "source_flow": flow_id}
                message = AgentMessage(
                    sender=f"DataFlowManager:{flow_id}",
                    receiver=target_agent_id,
                    message_type=MessageType.TASK_ASSIGNMENT,
                    content=message_content,
                    context=context,
                    trace_id=context.get("trace_id") # Propagate trace ID if available
                )
                await self.communication.send_message(message)
                log_entry["triggered_agent_task"] = target_task_type
                log_entry["triggered_agent_id"] = target_agent_id
                log_entry["triggered_message_id"] = message.id
                result_payload = {"triggered_message_id": message.id}
                self.logger.info(f"Flow {flow_id}: Sent message {message.id}.")
            elif not self.communication:
                 self.logger.error(f"Flow {flow_id}: Cannot trigger agent, CommunicationSystem unavailable.")
                 raise RuntimeError("CommunicationSystem unavailable")
            else:
                self.logger.info(f"Flow {flow_id}: No target agent/task defined. Delivery step skipped.")
                result_payload = {"delivered_data_count": log_entry["final_data_count"]}

            log_entry["status"] = "success"
            self.sync_logs.append(log_entry)
            return {"success": True, "status": "success", **result_payload}

        except Exception as e:
            self.logger.error(f"Error executing flow {flow_id}: {e}", exc_info=True)
            log_entry["status"] = "failed"; log_entry["error"] = str(e)
            self.sync_logs.append(log_entry)
            return {"success": False, "error": str(e)}

    # get_flow_status, get_sync_logs remain similar

# --- Autonomous Knowledge Agent (Refactored for MoA/EnhancedBaseAgent) ---
class AutonomousKnowledgeAgent(EnhancedBaseAgent):
    """
    Autonomously enriches the KG via external research, experimentation, and proactive discovery.
    Implements the B.O.S.S. self-teaching loop. Inherits from EnhancedBaseAgent.
    """
    def __init__(self, agent_id: str, config: Dict, communication_system: UnifiedCommunicationSystem,
                 knowledge_graph: Any, knowledge_updater: Any, # Pass KU
                 foundation_model_client: Optional[Any] = None, # Use Any for flexibility
                 knowledge_distiller: Optional[Any] = None,
                 moe_manager: Optional[MixtureOfExpertsManager] = None,
                 capabilities: List[str] = None):
        super().__init__(agent_id, config, communication_system, knowledge_graph,
                         capabilities or ["run_discovery", "run_boss_cycle", "design_experiment", "launch_experiment", "analyze_experiment"])
        self.knowledge_updater = knowledge_updater # Store KU
        self.fm_client = foundation_model_client
        self.knowledge_distiller = knowledge_distiller
        self.moe_manager = moe_manager
        self.discovery_monitors = {}
        self.experiments = {}
        self.agent_history = deque(maxlen=500)
        if not self.fm_client: self.logger.warning("FoundationModelClient not provided.")
        if not self.knowledge_distiller: self.logger.warning("KnowledgeDistiller not provided.")
        if not self.moe_manager: self.logger.warning("MoEManager not provided.")
        self.logger.info("Autonomous Knowledge Agent (MoA Integrated) initialized.")

    async def process_task(self, task_type: str, task_details: Dict[str, Any]) -> Any:
        """Handles tasks assigned via messages."""
        if task_type == "run_discovery":
            await self.run_discovery_cycle()
            return {"status": "discovery_cycle_complete"}
        elif task_type == "run_boss_cycle":
            await self.run_boss_cycle()
            return {"status": "boss_cycle_complete"}
        elif task_type == "design_experiment":
            exp_id = await self.design_experiment(**task_details) # Pass details as kwargs
            return {"status": "experiment_designed", "experiment_id": exp_id}
        # Add handlers for launch/analyze experiment tasks
        else:
            raise NotImplementedError(f"AKA does not support task type: {task_type}")

    # --- Research & Discovery (Methods become async, use await for IO/LLM calls) ---
    async def add_discovery_monitor(self, monitor_id, source_type, query, frequency_hours, processing_pipeline):
        # Logic remains similar, just ensure logging is correct
        self.discovery_monitors[monitor_id] = {
            "source_type": source_type, "query": query, "frequency_hours": frequency_hours,
            "processing_pipeline": processing_pipeline, "last_checked": None,
            "created_at": datetime.datetime.now().isoformat()
        }
        self.logger.info(f"Added discovery monitor: {monitor_id}")

    async def run_discovery_cycle(self):
        self.logger.info("Starting external discovery cycle...")
        now = datetime.datetime.now()
        triggered_count = 0
        for monitor_id, monitor in self.discovery_monitors.items():
            # Check frequency logic remains same
            should_run = False
            if monitor["last_checked"] is None or now - datetime.datetime.fromisoformat(monitor["last_checked"]) >= datetime.timedelta(hours=monitor["frequency_hours"]):
                 should_run = True

            if should_run:
                self.logger.info(f"Running discovery monitor: {monitor_id}")
                run_log = {"monitor_id": monitor_id, "timestamp": now.isoformat(), "status": "started"}
                triggered_count += 1
                try:
                    # Make external fetching async
                    raw_findings = await self._fetch_external_data(monitor["source_type"], monitor["query"])
                    monitor["last_checked"] = now.isoformat()

                    if raw_findings:
                        run_log["findings_count"] = len(raw_findings)
                        # Make processing async
                        processed_insights = await self._process_findings(raw_findings, monitor.get("processing_pipeline", []))
                        run_log["processed_count"] = len(processed_insights)
                        if processed_insights:
                            # Make integration async
                            integration_results = await self._integrate_insights(processed_insights, f"discovery:{monitor_id}")
                            run_log["insights_integrated"] = integration_results.get("integrated_count", 0)
                            run_status = "success"
                        else: run_status = "no_insights_processed"
                    else: run_status = "no_new_findings"
                    run_log["status"] = run_status
                except Exception as e:
                    self.logger.error(f"Error running monitor {monitor_id}: {e}", exc_info=True)
                    run_log["status"] = "error"; run_log["error"] = str(e)
                self.agent_history.append(run_log)
        self.logger.info(f"External discovery cycle finished. Triggered {triggered_count} monitors.")

    async def _fetch_external_data(self, source_type, query):
        """Fetches data asynchronously using aiohttp."""
        self.logger.debug(f"Fetching external data async: Type={source_type}, Query='{query}'")
        findings = []
        try:
            # Use aiohttp for async HTTP requests
            async with aiohttp.ClientSession() as session:
                if source_type == 'web_search':
                    self.logger.warning("Web search fetching requires a real search API integration.")
                    findings = [{"title": f"Simulated Async Result for {query} 1", "link": "http://example.com/1", "content_preview": "Async content..."}]
                elif source_type == 'api':
                    self.logger.warning("News API fetching requires API key configuration.")
                    findings = [{"title": f"Simulated Async News about {query}", "link": "http://example.com/news", "content_preview": "Async market trends..."}]
                else:
                    self.logger.warning(f"Unsupported external source type: {source_type}")
        except aiohttp.ClientError as e:
            self.logger.error(f"HTTP error fetching external data async for '{query}': {e}")
        except Exception as e:
            self.logger.error(f"Error fetching external data async for '{query}': {e}")
        return findings

    async def _process_findings(self, findings, pipeline):
        """Processes findings asynchronously using FM Client."""
        processed = []
        if not self.fm_client: return []
        model_alias = self.config.get("aka_processing_model_alias", "llama4_maverick") # Use config key

        async def process_single(finding):
            text_content = finding.get('content_preview') or finding.get('snippet') or finding.get('title')
            if not text_content: return None
            current_data = finding.copy()
            try:
                for step in pipeline:
                    # Assume fm_client methods are async or wrap sync calls
                    if step == 'summarize':
                        # summary = await self.fm_client.summarize_async(...) # If async method exists
                        summary = await asyncio.to_thread(self.fm_client.summarize, text_content, max_length=100, model_alias=model_alias) # Wrap sync call
                        current_data['summary_ai'] = summary
                    elif step == 'extract_entities':
                        entities = await asyncio.to_thread(self.fm_client.extract_entities, text_content, model_alias=model_alias)
                        current_data['entities_ai'] = entities
                    # Add other async steps
                return current_data
            except Exception as e:
                 self.logger.error(f"Error processing finding async '{finding.get('title', 'N/A')}': {e}")
                 return None

        tasks = [process_single(f) for f in findings]
        results = await asyncio.gather(*tasks)
        return [r for r in results if r is not None]

    async def _integrate_insights(self, insights: List[Dict], source_prefix: str) -> Dict:
        """Integrates insights asynchronously via KnowledgeUpdater."""
        if not self.knowledge_updater: return {"integrated_count": 0, "failed_count": len(insights)}
        kg_updates = []
        for insight in insights:
            entity_dict = {
                "_resolution_hints": {"type": "ExternalInsight", "link": insight.get("link")},
                "title": insight.get("title"), "link": insight.get("link"), "source_monitor": source_prefix,
                "summary_ai": insight.get("summary_ai"), "entities_ai": insight.get("entities_ai"),
                "processed_at": datetime.datetime.now().isoformat()
            }
            kg_updates.append(entity_dict)
        if not kg_updates: return {"integrated_count": 0, "failed_count": 0}

        self.logger.info(f"Integrating {len(kg_updates)} insights async into KG from {source_prefix}...")
        # Assume knowledge_updater.process_updates is sync, wrap it
        results = await asyncio.to_thread(self.knowledge_updater.process_updates, kg_updates, source=source_prefix)
        integrated_count = sum(1 for r in results if r.get("success"))
        failed_count = len(results) - integrated_count
        self.logger.info(f"Async integration complete. Successful: {integrated_count}, Failed: {failed_count}")
        return {"integrated_count": integrated_count, "failed_count": failed_count}

    # --- B.O.S.S. Self-Teaching Loop (Methods become async) ---
    async def run_boss_cycle(self):
        self.logger.info("Starting B.O.S.S. self-teaching cycle...")
        try:
            gaps = await self._identify_knowledge_gaps()
            if not gaps: self.logger.info("No significant knowledge gaps identified."); return
            selected_gap = gaps[0]
            self.logger.info(f"Selected knowledge gap: {selected_gap.get('description')}")

            research_findings = await self._trigger_subagent_research(selected_gap.get('description'))
            if not research_findings: self.logger.warning("SubAgent research yielded no findings."); return

            mini_model_info = await self._generate_mini_model(research_findings, selected_gap.get('potential_task', 'Generic task'))
            if not mini_model_info: self.logger.error("Failed to generate mini-model."); return

            is_valid = await self._validate_mini_model(mini_model_info.get('model_path'), validation_data=None)
            if not is_valid: self.logger.warning(f"Mini-model {mini_model_info.get('expert_id')} failed validation."); return

            deploy_success = await self._deploy_and_register_mini_model(
                model_path=mini_model_info.get('model_path'), expert_id=mini_model_info.get('expert_id'),
                domain=selected_gap.get('domain', 'general'), task_type=mini_model_info.get('task_type', 'classification')
            )
            if deploy_success: self.logger.info(f"Successfully deployed/registered mini-model: {mini_model_info.get('expert_id')}")
            else: self.logger.error(f"Failed to deploy/register mini-model: {mini_model_info.get('expert_id')}")
        except Exception as e:
            self.logger.error(f"Error during B.O.S.S. cycle: {e}", exc_info=True)

    async def _identify_knowledge_gaps(self) -> List[Dict]:
        self.logger.debug("Identifying knowledge gaps async...")
        # Wrap sync logic or implement async KG queries
        await asyncio.sleep(0.1) # Simulate async work
        if random.random() < 0.1:
             return [{"description": "Async gap: Lack of model for engagement score.", "domain": "engagement", "potential_task": "Predict score"}]
        return []

    async def _trigger_subagent_research(self, gap_description: str) -> List[Dict]:
        self.logger.debug(f"Triggering async sub-agent research for: {gap_description}")
        # Use async fetch/process methods
        query = f"data for {gap_description}"
        findings = await self._fetch_external_data(source_type='web_search', query=query)
        processed_findings = await self._process_findings(findings, ['summarize', 'extract_entities'])
        return processed_findings

    async def _generate_mini_model(self, research_findings: List[Dict], task_description: str) -> Optional[Dict]:
        if not self.knowledge_distiller or not self.fm_client: return None
        self.logger.info(f"Generating mini-model async for task: {task_description}")
        try:
            # Dataset prep remains similar (sync for now)
            dataset = {"inputs": [f.get('summary_ai', '') for f in research_findings if f.get('summary_ai')],
                       "targets": [len(f.get('entities_ai', [])) for f in research_findings if f.get('summary_ai')]}
            if not dataset["inputs"]: return None

            # Model building remains sync
            # MIZ 3.0 TODO: Use MiniModel class if defined elsewhere
            student_model = tf.keras.Sequential([tf.keras.layers.Dense(10)]) # Dummy model
            model_id = f"mini_model_{uuid.uuid4().hex[:8]}"

            # Wrap sync distillation call
            success = await asyncio.to_thread(
                self.knowledge_distiller.distill_knowledge,
                student_model=student_model, dataset=dataset, distillation_params={"epochs": 1} # Minimal epochs
            )

            if success:
                model_path = f"/tmp/mini_models/{model_id}"
                # Wrap sync save call
                await asyncio.to_thread(student_model.save, model_path, save_format='tf')
                self.logger.info(f"Generated and saved mini-model async to {model_path}")
                return {"expert_id": model_id, "model_path": model_path, "task_type": "regression"}
            else: return None
        except Exception as e:
            self.logger.error(f"Error generating mini-model async: {e}", exc_info=True)
            return None

    async def _validate_mini_model(self, model_path: str, validation_data=None) -> bool:
        self.logger.debug(f"Validating mini-model async at {model_path} (Placeholder).")
        # Wrap sync validation logic
        await asyncio.sleep(0.1) # Simulate async work
        return True # Placeholder

    async def _deploy_and_register_mini_model(self, model_path: str, expert_id: str, domain: str, task_type: str) -> bool:
        if not self.communication or not self.moe_manager: return False # Check communication system now
        self.logger.info(f"Requesting async deployment for mini-model: {expert_id}")
        pipeline_task_data = {"model_source_path": model_path, "expert_id": expert_id, "task_type": task_type, "domain": domain}
        try:
            # Send message to trigger MLOps pipeline
            await self.send_message(
                 receiver="MLOpsAgent", # Hypothetical agent ID
                 msg_type=MessageType.TASK_ASSIGNMENT,
                 content={"task_type": "deploy_expert_model", **pipeline_task_data},
                 context={"trigger_agent": self.agent_id}
            )
            self.logger.info(f"Deployment task message sent for {expert_id}.")

            # --- Placeholder for getting deployment results ---
            # In MoA, the MLOps agent would send a TASK_RESULT message back.
            # This agent would need to handle that message to get deployment details.
            # For now, simulate success and register with placeholder details.
            await asyncio.sleep(2) # Simulate pipeline run time
            model_resource_name_placeholder = f"projects/{self.config.get('project_id')}/locations/{self.config.get('region')}/models/{expert_id}_deployed"
            endpoint_resource_name_placeholder = f"projects/{self.config.get('project_id')}/locations/{self.config.get('region')}/endpoints/{domain}_endpoint"
            # --- End Placeholder ---

            reg_success = self.moe_manager.register_expert(
                expert_id=expert_id, task_type=task_type, domain=domain,
                model_resource_name=model_resource_name_placeholder, endpoint_name=endpoint_resource_name_placeholder
            )
            return reg_success
        except Exception as e:
            self.logger.error(f"Error during async mini-model deployment/registration for {expert_id}: {e}")
            return False

    # --- Autonomous Experimentation (Methods become async) ---
    async def design_experiment(self, goal_description, target_metric, control_group_query, treatment_variations):
        self.logger.info(f"Designing experiment async for goal: {goal_description}")
        exp_id = f"exp_{uuid.uuid4().hex[:8]}"
        await asyncio.sleep(0.1) # Simulate async work
        self.experiments[exp_id] = {
            "goal": goal_description, "metric": target_metric, "status": "designed",
            "control_query": control_group_query, "variants": treatment_variations,
            "start_date": None, "end_date": None, "results": None
        }
        self.logger.info(f"Experiment {exp_id} designed async.")
        return exp_id

    async def launch_experiment(self, experiment_id):
        if experiment_id not in self.experiments: return False
        self.experiments[experiment_id]["status"] = "running"
        self.experiments[experiment_id]["start_date"] = datetime.datetime.now().isoformat()
        self.logger.info(f"Launching experiment {experiment_id} async (Placeholder - requires integration).")
        # MIZ 3.0 TODO: Send message to trigger experiment execution agent
        await asyncio.sleep(0.1)
        return True

    async def analyze_experiment_results(self, experiment_id):
        if experiment_id not in self.experiments or self.experiments[experiment_id]["status"] != "running": return None
        self.logger.info(f"Analyzing results async for experiment {experiment_id} (Placeholder).")
        await asyncio.sleep(0.2) # Simulate analysis
        simulated_results = {var: {"metric_value": random.uniform(0.05, 0.15)} for var in self.experiments[experiment_id]["variants"]}
        simulated_results["control"] = {"metric_value": random.uniform(0.04, 0.10)}
        winner = max(simulated_results, key=lambda k: simulated_results[k]["metric_value"])
        self.experiments[experiment_id]["status"] = "completed"
        self.experiments[experiment_id]["end_date"] = datetime.datetime.now().isoformat()
        self.experiments[experiment_id]["results"] = {"winner": winner, "details": simulated_results}
        self.logger.info(f"Experiment {experiment_id} analysis complete async. Winner: {winner}")
        # MIZ 3.0 TODO: Send message to LearningIntegration
        return self.experiments[experiment_id]["results"]

    # get_agent_status, get_history remain synchronous helper methods

# --- Initialization ---
# Ensure dependencies are initialized correctly from other cells/placeholders
_eshkg = eshkg if 'eshkg' in locals() else PlaceholderKG()
_knowledge_updater = knowledge_updater if 'knowledge_updater' in locals() else PlaceholderKU()
_config = CONFIG if 'CONFIG' in locals() else {}
_foundation_model_client = foundation_model_client if 'foundation_model_client' in locals() else PlaceholderFMClient()
_knowledge_distiller = kd if 'kd' in locals() else PlaceholderKD()
_privacy_controls = privacy_controls if 'privacy_controls' in locals() else PlaceholderPC()
# MoA system provides communication, no need for separate orchestrator reference here
_communication_system = miz_moa_system.communication_system if 'miz_moa_system' in locals() and miz_moa_system else None
_moe_manager = moe_manager if 'moe_manager' in locals() else PlaceholderMoEManager()

# Instantiate Foundational Layer Components (MoA Integrated)
# MoEManager is instantiated here, but might be better managed globally or passed via config
moe_manager_instance = _moe_manager or MixtureOfExpertsManager(_config)

# DataFlowManager now takes the communication system from MoA
data_flow_manager_instance = None
if _eshkg and _privacy_controls and _communication_system:
    data_flow_manager_instance = DataFlowManager(_eshkg, _privacy_controls, _communication_system)
else:
    logger.warning("Cannot initialize DataFlowManager: Missing KG, PrivacyControls, or CommunicationSystem.")

# AutonomousKnowledgeAgent is created by the AgentFactory within MIZ_MoA_System
# We don't instantiate it directly here anymore.

print("--- MIZ 3.0 Foundational Layer Definitions Updated (MoA Integrated) ---")
if moe_manager_instance: print(f"MoEManager: Instantiated.")
else: print("MoEManager: Failed to instantiate.")
if data_flow_manager_instance: print(f"DataFlowManager: Instantiated (uses MoA Comms).")
else: print("DataFlowManager: Failed to instantiate.")
print("AutonomousKnowledgeAgent: Definition updated to inherit EnhancedBaseAgent (instantiated by MoA System).")
print("----------------------------------------------------------------")

# Example Usage (Conceptual - Triggering AKA via MoA message)
# async def trigger_aka_discovery():
#     if 'miz_moa_system' in locals() and miz_moa_system and miz_moa_system.initialized:
#         aka_agent_id = next((aid for aid, agent in miz_moa_system.agents.items() if isinstance(agent, AutonomousKnowledgeAgent)), None)
#         if aka_agent_id:
#             print("\nSending 'run_discovery' task to AutonomousKnowledgeAgent via MoA...")
#             message = AgentMessage(sender="system_test", receiver=aka_agent_id, message_type=MessageType.TASK_ASSIGNMENT, content={"task_type": "run_discovery"})
#             await miz_moa_system.communication_system.send_message(message)
#             print("Message sent.")
#         else:
#             print("AutonomousKnowledgeAgent not found in MoA system.")
#     else:
#         print("MoA system not initialized.")
#
# # To run the example trigger:
# # asyncio.run(trigger_aka_discovery())



--- MIZ 3.0 Foundational Layer Definitions Updated (MoA Integrated) ---
MoEManager: Instantiated.
DataFlowManager: Failed to instantiate.
AutonomousKnowledgeAgent: Definition updated to inherit EnhancedBaseAgent (instantiated by MoA System).
----------------------------------------------------------------


In [36]:
# Cell 5: Core Processes Layer Implementation (MIZ 3.0 OKI - Reworked for MoA)
# Status: Integrates LLaMA 4 via FM Client. Connects to MoA Comms. Causal/Simulation placeholders added. LI triggers MLOps via message. PO uses FM forecasting. AGG/SCF trigger tasks via message.

import time
import datetime
import logging
import numpy as np
import pandas as pd
import random
import json
import uuid # Added for IDs
from typing import Dict, Any, Optional, List, Union, Callable
from collections import deque, defaultdict # Added deque

# --- MoA/Orchestrator Dependency ---
# Import the new MoA system components from Cell 15
# Assuming Cell 15 defines: EnhancedBaseAgent, UnifiedCommunicationSystem, AgentMessage, MessageType
try:
    from cell15 import EnhancedBaseAgent, UnifiedCommunicationSystem, AgentMessage, MessageType, MIZ_MoA_System
except ImportError:
    logging.warning("Could not import MoA components from Cell 15. Using placeholders.")
    # Add placeholders similar to Cell 4 if needed

# --- Other Dependencies ---
# Assume these are available or use placeholders
# from cell1 import EnhancedConfig, CONFIG
# from cell3 import EnhancedSelfHealingKG
# from cell4 import MixtureOfExpertsManager # Defined in Cell 4 now
# from cell7 import AdaptiveWorkflowEvolution # Conceptually linked
# from cell18 import FoundationModelClient

# --- Placeholder Dependencies ---
class PlaceholderKG:
    async def find_entity_by_hints(self, *args, **kwargs): logger.debug("PlaceholderKG.find_entity_by_hints"); return f"resolved_{random.randint(1000,9999)}"
    async def get_entity(self, *args, **kwargs): logger.debug("PlaceholderKG.get_entity"); return {"type": "placeholder", "mizId": args[0]}
    async def add_entity(self, *args, **kwargs): logger.debug("PlaceholderKG.add_entity"); return {"success": True, "mizId": f"ent_{random.randint(1000,9999)}", "is_new": True}
class PlaceholderMoEManager:
    expert_registry = {"roas_forecaster_v1": {}}
    async def invoke_expert(self, expert_id, *args, **kwargs): logger.debug(f"PlaceholderMoE.invoke_expert async for {expert_id}"); return {"prediction": [random.random()]}
class PlaceholderFMClient:
    async def generate_text(self, prompt, *args, **kwargs): logger.debug("PlaceholderFMClient.generate_text async"); return f"LLaMA4 async response to: {prompt[:50]}..."
class PlaceholderWorkflowEvolver: pass
class PlaceholderCommunicationSystem:
     async def send_message(self, *args, **kwargs): logger.debug("PlaceholderComms.send_message")
# --- End Placeholders ---

logger = logging.getLogger('MIZ-OKI.CoreProcesses')

# --- Causal/Simulation Module Placeholders (Remain Similar, potentially async) ---
class CausalReasoningModule:
    def __init__(self, kg): self.kg = kg; self.logger = logging.getLogger('MIZ-OKI.CausalModule')
    async def estimate_effect(self, query, context): # Made async
        self.logger.info(f"Simulating async causal query: {query}")
        await asyncio.sleep(0.1) # Simulate async work
        return {"effect_size": random.uniform(-0.2, 0.3), "confidence": random.uniform(0.6, 0.9), "_info": "Simulated Async Causal Result"}

class SimulationModule:
    def __init__(self, kg, fm_client): self.kg = kg; self.fm_client = fm_client; self.logger = logging.getLogger('MIZ-OKI.SimulationModule')
    async def run_scenario(self, scenario_config, context): # Made async
        self.logger.info(f"Running async simulation scenario: {scenario_config.get('name', 'unnamed')}")
        await asyncio.sleep(0.2) # Simulate async work
        base_roas = context.get("current_metrics", {}).get("roas", 3.0)
        sim_roas = base_roas * random.uniform(0.9, 1.15)
        return {"predicted_roas": sim_roas, "_info": f"Simulated async outcome for scenario {scenario_config.get('name')}"}

# --- Enhanced Knowledge Update (KU Pillar - Async) ---
class KnowledgeUpdate:
    """ Manages async KG updates, validation, conflict resolution. """
    def __init__(self, knowledge_graph: Any, conflict_resolver: 'ConflictResolution'):
        self.kg = knowledge_graph
        self.conflict_resolver = conflict_resolver
        self.update_history = deque(maxlen=5000)
        self.validation_rules = defaultdict(dict)
        self.update_rules = defaultdict(dict)
        self.logger = logging.getLogger('MIZ-OKI.KnowledgeUpdate')
        self.logger.info("Knowledge Update process initialized (Async).")

    # register_validation_rule, register_update_rule remain synchronous setup methods

    async def process_updates(self, updates: List[Dict], source: str) -> List[Dict]:
        """ Processes a batch of updates asynchronously. """
        results = []
        if not isinstance(updates, list): return [{"success": False, "error": "Input 'updates' must be a list."}]

        # Process updates concurrently
        tasks = [self._process_single_update(update_data, source) for update_data in updates]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Handle potential exceptions returned by gather
        final_results = []
        for i, res in enumerate(results):
            if isinstance(res, Exception):
                hints = updates[i].get("_resolution_hints", {})
                self.logger.error(f"Unhandled exception processing update for {hints}: {res}", exc_info=res)
                final_results.append({"success": False, "hints": hints, "status": "failed", "error": str(res)})
            else:
                final_results.append(res)
        return final_results

    async def _process_single_update(self, update_data: Dict, source: str) -> Dict:
        """ Process a single update item asynchronously. """
        if not isinstance(update_data, dict): return {"success": False, "error": "Update item is not a dictionary."}
        hints = update_data.get("_resolution_hints"); attributes = {k: v for k, v in update_data.items() if not k.startswith('_')}
        entity_type_hint = hints.get("type") if hints else None
        if not hints or not entity_type_hint: return {"success": False, "error": "Missing '_resolution_hints' or 'type'."}
        if not attributes: return {"success": False, "hints": hints, "error": "Missing attributes."}

        log_entry = {"timestamp": datetime.datetime.now().isoformat(), "source": source, "input_hints": hints, "status": "pending", "validations": [], "rules_applied": [], "conflicts_resolved": []}

        try:
            # Use async KG methods
            resolved_mizId = await self.kg.find_entity_by_hints(hints)
            current_entity = await self.kg.get_entity(resolved_mizId) if resolved_mizId else None
            is_new_entity = current_entity is None
            entity_id = resolved_mizId or hints.get("mizId"); entity_type = entity_type_hint
            log_entry["entity_id"] = entity_id; log_entry["entity_type"] = entity_type; log_entry["is_new"] = is_new_entity

            # Validation (assuming rules are sync for now, wrap if needed)
            validation_passed = True
            if entity_type in self.validation_rules:
                for rule_id, rule_func in self.validation_rules[entity_type].items():
                    is_valid, message = rule_func((current_entity or {}).copy(), attributes.copy())
                    log_entry["validations"].append({"rule_id": rule_id, "passed": is_valid, "message": message})
                    if not is_valid: validation_passed = False; break
            if not validation_passed: raise ValueError(f"Validation failed: {log_entry['validations'][-1]['message']}")

            # Update Rules (assuming sync)
            modified_attributes = attributes.copy()
            # ... (apply update rules logic) ...

            # Conflict Resolution (make async)
            potential_conflicts = await self.conflict_resolver.detect_conflicts_for_update(hints, current_entity, modified_attributes)
            final_attributes = modified_attributes.copy()
            if potential_conflicts:
                self.logger.warning(f"{len(potential_conflicts)} conflicts for {hints}. Resolving async...")
                resolution_tasks = [self.conflict_resolver.resolve_conflict(c) for c in potential_conflicts]
                resolution_results = await asyncio.gather(*resolution_tasks)
                log_entry["conflicts_resolved"] = resolution_results
                for res in resolution_results:
                    if res.get("success") and res.get("updated_attributes"): final_attributes.update(res["updated_attributes"])
                    elif not res.get("success"): raise RuntimeError(f"Unresolved conflict: {res.get('conflict', {}).get('conflict_type')}")

            # Commit (use async KG method)
            entity_to_commit = {"_resolution_hints": hints, **final_attributes}
            if 'type' not in entity_to_commit: entity_to_commit['type'] = entity_type
            commit_result = await self.kg.add_entity(entity_to_commit, source)

            if commit_result and commit_result.get("success"):
                log_entry["status"] = "success"; log_entry["entity_id"] = commit_result.get("mizId")
                result = {"success": True, "hints": hints, "mizId": commit_result.get("mizId"), "status": "success", "is_new": commit_result.get("is_new", is_new_entity)}
            else:
                error_msg = commit_result.get("error", "KG add_entity failed") if isinstance(commit_result, dict) else "KG add_entity failed"
                raise RuntimeError(f"KG commit failed: {error_msg}")

        except Exception as e:
            self.logger.error(f"Failed processing update for {hints}: {e}", exc_info=False)
            log_entry["status"] = "failed"; log_entry["error"] = str(e)
            result = {"success": False, "hints": hints, "status": "failed", "error": str(e)}

        self.update_history.append(log_entry)
        return result

    # get_update_history remains synchronous

# --- Enhanced Hybrid Decision Engine (DM Pillar - Async) ---
class HybridDecisionEngine:
    """ Makes decisions asynchronously using hybrid AI, LLaMA 4, ethical checks. """
    def __init__(self, knowledge_graph: Any, moe_manager: Any,
                 ethical_guardrails: 'EthicalGuardrailsEngine', fm_client: Optional[Any] = None):
        self.kg = knowledge_graph
        self.moe_manager = moe_manager
        self.ethical_guardrails = ethical_guardrails
        self.fm_client = fm_client
        self.decision_blueprints = {}
        self.decision_history = deque(maxlen=5000)
        self.causal_module = CausalReasoningModule(self.kg)
        self.simulation_module = SimulationModule(self.kg, self.fm_client)
        self.logger = logging.getLogger('MIZ-OKI.HybridDecisionEngine')
        self.logger.info("Hybrid Decision Engine initialized (Async).")

    # register_blueprint remains synchronous setup

    async def _execute_reasoning_module(self, module_config, context):
        """ Executes a reasoning module asynchronously. """
        module_type = module_config.get("type"); module_id = module_config.get("id", f"{module_type}_{random.randint(100,999)}")
        output = {"type": module_type, "id": module_id}; start_time = time.time()
        try:
            if module_type == "model":
                expert_id = module_config.get("expert_id"); inputs_keys = module_config.get("inputs")
                if not expert_id: raise ValueError("Missing 'expert_id'")
                model_input = {k: context[k] for k in inputs_keys if k in context} if inputs_keys else context
                # Assume moe_manager.invoke_expert is async or wrap it
                prediction = await self.moe_manager.invoke_expert(expert_id, model_input)
                if prediction is None: raise RuntimeError("Model invocation failed.")
                output["result"] = prediction; output["expert_id"] = expert_id
            elif module_type == "rule":
                # Rule execution likely remains sync, wrap if needed
                rule_set_id = module_config.get("rule_set_id")
                if not rule_set_id: raise ValueError("Missing 'rule_set_id'")
                await asyncio.sleep(0.01) # Simulate async work
                result = {"action": "no_change", "reason": "Default rule (Simulated)"}
                output["result"] = result; output["rule_set_id"] = rule_set_id
            elif module_type == "causal":
                query = module_config.get("query", "effect(X->Y)")
                output["result"] = await self.causal_module.estimate_effect(query, context)
                output["query"] = query
            elif module_type == "simulation":
                scenario_config = module_config.get("scenario_config", {"name": "default"})
                output["result"] = await self.simulation_module.run_scenario(scenario_config, context)
                output["scenario"] = scenario_config.get("name")
            elif module_type == "llama4_reasoning":
                if not self.fm_client: raise RuntimeError("FMClient unavailable.")
                prompt_template = module_config.get("prompt_template"); model_alias = module_config.get("model_alias", "llama4_maverick")
                if not prompt_template: raise ValueError("Missing 'prompt_template'")
                try: prompt = prompt_template.format(**context)
                except KeyError as fmt_e: raise ValueError(f"Missing context '{fmt_e}' for prompt.")
                # Assume fm_client.generate_text is async or wrap it
                llama_response = await self.fm_client.generate_text(prompt, model_alias=model_alias, max_tokens=512)
                if llama_response is None: raise RuntimeError("LLaMA 4 call failed.")
                output["result"] = {"raw_output": llama_response}; output["model_alias"] = model_alias
            else: output["error"] = "Unsupported module type"
        except Exception as mod_e: output["error"] = str(mod_e); self.logger.error(f"Module {module_id} error: {mod_e}", exc_info=False)
        output["duration_ms"] = (time.time() - start_time) * 1000
        return output

    # _aggregate_outputs remains synchronous logic

    async def make_decision(self, decision_type, context):
        """ Makes a decision asynchronously using the hybrid approach. """
        if decision_type not in self.decision_blueprints: return {"success": False, "error": "Blueprint not found"}
        blueprint = self.decision_blueprints[decision_type]; decision_id = f"dec_{decision_type}_{uuid.uuid4().hex[:12]}"
        log_entry = {"decision_id": decision_id, "decision_type": decision_type, "timestamp_start": datetime.datetime.now().isoformat(), "context": context, "status": "pending", "module_outputs": {}, "ethical_review": {}, "final_decision": None, "confidence": 0.0, "chain_of_thought": []}
        cot = log_entry["chain_of_thought"]

        try:
            cot.append(f"Start async decision '{decision_type}'. Context: {list(context.keys())}")
            # Context Validation (sync)
            # ... (validation logic) ...
            cot.append("Context validation passed.")

            # Execute Modules Concurrently
            cot.append("Executing reasoning modules concurrently...")
            module_tasks = [self._execute_reasoning_module(mod_cfg, context) for mod_cfg in blueprint.get("reasoning_modules", [])]
            module_results_list = await asyncio.gather(*module_tasks)
            module_outputs = {res.get("id", f"unknown_{i}"): res for i, res in enumerate(module_results_list)}
            log_entry["module_outputs"] = module_outputs
            for mod_id, mod_res in module_outputs.items():
                 status = "Success" if "error" not in mod_res else f"Failed ({mod_res['error']})"
                 cot.append(f"  - Module '{mod_id}' ({mod_res.get('type')}): {status}. Duration: {mod_res.get('duration_ms', 0):.0f}ms")
            cot.append("Reasoning modules execution complete.")

            # Aggregation (sync)
            aggregation_logic = blueprint.get("aggregation_logic", "prioritized")
            cot.append(f"Aggregating outputs ({aggregation_logic})...")
            aggregated_decision, aggregated_confidence, source_module_id, source_module_type = self._aggregate_outputs(module_outputs, aggregation_logic)
            if aggregated_decision is None: raise ValueError("Aggregation failed.")
            cot.append(f"Aggregation result: Decision={aggregated_decision}, Confidence={aggregated_confidence:.3f}, Source='{source_module_id}'")
            log_entry["aggregated_decision"] = aggregated_decision; log_entry["aggregated_confidence"] = aggregated_confidence
            log_entry["source_module_id"] = source_module_id; log_entry["source_module_type"] = source_module_type

            # Ethical Guardrails (make async if checks involve IO/LLM)
            cot.append("Performing ethical review...")
            ethical_review = await self.ethical_guardrails.review_decision(decision_type, context, aggregated_decision)
            log_entry["ethical_review"] = ethical_review
            cot.append(f"Ethical review: Approved={ethical_review.get('approved')}. Reason='{ethical_review.get('reason', '')}'")

            final_decision = aggregated_decision; final_confidence = aggregated_confidence
            if not ethical_review.get("approved", False):
                final_decision["ethics_flag"] = {"status": "review_needed", "reason": ethical_review.get('reason')}
                final_confidence *= 0.8; log_entry["status"] = "ethics_review_needed"
                cot.append(f"Decision flagged by ethics. Confidence adjusted: {final_confidence:.3f}.")
            else: log_entry["status"] = "approved_by_ethics"; cot.append("Ethical checks passed.")

            log_entry["final_decision"] = final_decision; log_entry["confidence"] = final_confidence
            cot.append(f"Final Decision: {final_decision}. Final Confidence: {final_confidence:.3f}")

            # Explanation Refs (sync)
            # ... (generate refs) ...
            log_entry["explanation_refs"] = [{"method": "chain_of_thought", "ref_id": f"cot_{decision_id}"}]
            cot.append(f"Explanation refs generated.")

            # Trigger Action (Log Only - Actual trigger via MoA message)
            min_confidence_for_action = self.config.get("decision_confidence_threshold", 0.85)
            is_actionable = final_decision.get("action") not in ["no_change", "raw_model_output", "causal_insight", "reasoned_output", "blocked"]
            if final_confidence >= min_confidence_for_action and is_actionable and log_entry["status"] != "ethics_review_needed":
                log_entry["action_triggered"] = True; self.logger.info(f"Decision {decision_id} meets threshold. Action required: {final_decision}")
                cot.append(f"Action required: {final_decision}")
                # MIZ 3.0: Instead of direct trigger, HDE should return the decision,
                # and the calling agent (e.g., BossAgent) sends the action message.
            else:
                 # ... (log reason for no action) ...
                 log_entry["action_triggered"] = False; cot.append("Action not triggered.")

            log_entry["status"] = "success"

        except Exception as e:
            self.logger.error(f"Async decision making failed for '{decision_type}' (ID: {decision_id}): {e}", exc_info=True)
            log_entry["status"] = "failed"; log_entry["error"] = str(e)
            cot.append(f"PROCESS FAILED: {e}")

        log_entry["timestamp_end"] = datetime.datetime.now().isoformat()
        self.decision_history.append(log_entry)
        return log_entry

    # get_decision_log, get_history remain synchronous

# --- Ethical Guardrails Engine (Async Review) ---
class EthicalGuardrailsEngine:
    """ Evaluates decisions asynchronously against ethical principles. """
    def __init__(self, config):
        self.config = config; self.checks = defaultdict(dict); self.logger = logging.getLogger('MIZ-OKI.EthicalGuardrails')
        self.logger.info("Ethical Guardrails Engine initialized (Async).")
        # MIZ 3.0 TODO: Load checks

    # register_check remains synchronous setup

    async def review_decision(self, decision_type, context, decision):
        """ Review a decision asynchronously against registered checks. """
        results = {"approved": True, "checks_passed": [], "checks_failed": [], "reason": "Checks passed."}
        if decision_type in self.checks:
            check_tasks = []
            for check_id, check_func in self.checks[decision_type].items():
                # Assume check_func might be async or wrap sync calls
                check_tasks.append(self._run_single_check(check_id, check_func, context, decision))

            check_results = await asyncio.gather(*check_tasks, return_exceptions=True)

            for i, res in enumerate(check_results):
                 check_id = list(self.checks[decision_type].keys())[i] # Get corresponding ID
                 if isinstance(res, Exception):
                      self.logger.error(f"Ethical check {check_id} failed with exception: {res}")
                      results["approved"] = False; failure_detail = {"check_id": check_id, "error": str(res)}
                      results["checks_failed"].append(failure_detail)
                      if results["reason"] == "Checks passed.": results["reason"] = f"Error in check {check_id}"
                 elif isinstance(res, tuple) and len(res) == 2:
                      is_approved, reason = res
                      if not is_approved:
                           results["approved"] = False; failure_detail = {"check_id": check_id, "reason": reason}
                           results["checks_failed"].append(failure_detail)
                           if results["reason"] == "Checks passed.": results["reason"] = reason
                           self.logger.warning(f"Ethical check failed ({decision_type}/{check_id}): {reason}")
                      else: results["checks_passed"].append(check_id)
                 else:
                      self.logger.error(f"Ethical check {check_id} returned invalid result format: {res}")
                      results["approved"] = False; failure_detail = {"check_id": check_id, "error": "Invalid result format"}
                      results["checks_failed"].append(failure_detail)
                      if results["reason"] == "Checks passed.": results["reason"] = f"Invalid result from {check_id}"

        if not results["approved"]: self.logger.warning(f"Decision '{decision_type}' failed ethical review. Reason: {results['reason']}")
        return results

    async def _run_single_check(self, check_id, check_func, context, decision):
        """Helper to run a single check, wrapping sync functions if needed."""
        if asyncio.iscoroutinefunction(check_func):
            return await check_func(context.copy(), decision.copy())
        else:
            # Wrap synchronous function call
            return await asyncio.to_thread(check_func, context.copy(), decision.copy())

# --- Enhanced Learning Integration (LI Pillar - Async & MoA Comms) ---
class LearningIntegration:
    """ Manages async integration of learning outcomes, triggers MLOps via messages. """
    def __init__(self, knowledge_graph: Any, moe_manager: Any,
                 knowledge_updater: Any, communication_system: Optional[UnifiedCommunicationSystem]): # Use MoA Comms
        self.kg = knowledge_graph
        self.moe_manager = moe_manager
        self.knowledge_updater = knowledge_updater
        self.communication = communication_system # Store comms system
        self.learning_history = deque(maxlen=5000)
        self.integration_rules = defaultdict(dict)
        self.bias_detectors = []
        self.logger = logging.getLogger('MIZ-OKI.LearningIntegration')
        if not self.communication: self.logger.warning("CommunicationSystem not provided. Cannot trigger retraining pipelines.")
        self.logger.info("Learning Integration process initialized (Async & MoA Integrated).")

    # register_integration_rule, register_bias_detector remain synchronous setup

    async def _run_bias_checks(self, knowledge_data, source):
        """ Runs bias detectors asynchronously. """
        bias_checks_log = []; bias_found = False
        if not self.bias_detectors: return bias_checks_log, bias_found
        self.logger.info(f"Running async bias checks on data from {source} (Placeholder)...")
        # Wrap sync detector calls or make detectors async
        async def run_detector(detector):
            try:
                report = await asyncio.to_thread(detector, knowledge_data) # Wrap sync call
                return {"detector": detector.__name__, "report": report}
            except Exception as bias_e:
                self.logger.error(f"Bias detector {detector.__name__} failed: {bias_e}", exc_info=False)
                return {"detector": detector.__name__, "error": str(bias_e)}

        results = await asyncio.gather(*(run_detector(d) for d in self.bias_detectors))
        for res in results:
             bias_checks_log.append(res)
             if res.get("report", {}).get("bias_detected"): bias_found = True
        if bias_found: self.logger.warning(f"Potential bias detected by one or more detectors.")
        return bias_checks_log, bias_found

    async def integrate_learning(self, knowledge_type, knowledge_data, source, importance=0.5):
        """ Integrates learning asynchronously, applying rules, bias checks, and triggering updates via messages. """
        integration_id = f"li_{knowledge_type}_{uuid.uuid4().hex[:12]}"
        log_entry = {"integration_id": integration_id, "timestamp": datetime.datetime.now().isoformat(), "knowledge_type": knowledge_type, "source": source, "importance": importance, "input_data": "...", "status": "pending", "bias_checks": [], "actions_taken": [], "triggered_messages": []} # Avoid logging large data

        try:
            bias_checks_log, bias_found = await self._run_bias_checks(knowledge_data, source)
            log_entry["bias_checks"] = bias_checks_log
            if bias_found: log_entry["bias_mitigation"] = "Flagged (MVP)"

            # Apply Integration Rules (sync for now)
            actions_to_take = []
            # ... (rule application logic) ...

            # Execute Actions Asynchronously
            executed_actions_summary = []
            action_tasks = [self._execute_action(action, integration_id, log_entry) for action in actions_to_take]
            action_results = await asyncio.gather(*action_tasks, return_exceptions=True)

            for i, res in enumerate(action_results):
                 action_type = actions_to_take[i].get("type", "unknown")
                 if isinstance(res, Exception):
                      self.logger.error(f"Error executing integration action {action_type}: {res}", exc_info=False)
                      executed_actions_summary.append({"type": action_type, "error": str(res)})
                 else:
                      executed_actions_summary.append(res) # Append the log dict returned by _execute_action

            log_entry["actions_taken"] = executed_actions_summary
            log_entry["status"] = "success"

        except Exception as e:
            self.logger.error(f"Async learning integration failed for '{knowledge_type}': {e}", exc_info=True)
            log_entry["status"] = "failed"; log_entry["error"] = str(e)

        self.learning_history.append(log_entry)
        return log_entry

    async def _execute_action(self, action: Dict, integration_id: str, log_entry_ref: Dict) -> Dict:
        """ Executes a single integration action asynchronously. """
        action_type = action.get("type"); action_log = {"type": action_type}
        try:
            if action_type == "update_kg":
                updates = action.get("updates")
                if updates:
                    # Use async KU
                    update_result = await self.knowledge_updater.process_updates(updates, f"li:{integration_id}")
                    action_log["result_summary"] = f"{sum(1 for r in update_result if r.get('success'))}/{len(update_result)} updates successful"
                else: action_log["error"] = "Missing 'updates'"
            elif action_type == "retrain_expert":
                expert_id = action.get("expert_id"); pipeline_params = action.get("pipeline_params", {})
                if expert_id and self.communication:
                    task_data = {"expert_id": expert_id, **pipeline_params}
                    message = AgentMessage(
                        sender=f"LearningIntegration:{integration_id}", receiver="MLOpsAgent", # Target MLOps agent
                        message_type=MessageType.TASK_ASSIGNMENT,
                        content={"task_type": "trigger_retraining_pipeline", **task_data},
                        trace_id=integration_id
                    )
                    await self.communication.send_message(message)
                    action_log["status"] = "retraining_triggered"; action_log["expert_id"] = expert_id
                    action_log["message_id"] = message.id
                    log_entry_ref["triggered_messages"].append(message.id) # Track triggered message
                elif not expert_id: action_log["error"] = "Missing 'expert_id'"
                elif not self.communication: action_log["error"] = "CommunicationSystem unavailable"
            # Add other action types (update_expert_config, flag_for_review)
            else: action_log["error"] = "Unsupported action type"
        except Exception as exec_e:
             self.logger.error(f"Error executing async action {action_type}: {exec_e}", exc_info=False)
             action_log["error"] = str(exec_e)
        return action_log

    # get_history remains synchronous

# --- Enhanced Holistic Performance Optimizer (PO Pillar - Async) ---
class HolisticOptimizer:
    """ Optimizes performance asynchronously towards holistic objectives. """
    def __init__(self, config: Dict, knowledge_graph: Any,
                 decision_engine: 'HybridDecisionEngine', moe_manager: Any):
        self.config = config; self.kg = knowledge_graph; self.decision_engine = decision_engine
        self.moe_manager = moe_manager; self.objectives = {}; self.targets = {}; self.baselines = {}
        self.metric_history = defaultdict(lambda: deque(maxlen=1000))
        self.forecasting_models = {}; self.optimization_history = deque(maxlen=500)
        self.logger = logging.getLogger('MIZ-OKI.HolisticOptimizer')
        self.logger.info("Holistic Optimizer initialized (Async).")
        self._load_objectives_from_config() # Sync setup

    # _load_objectives_from_config, register_forecasting_model, update_metric, _get_current_metric_value remain synchronous

    async def _predict_metric_value(self, metric_name, horizon_steps=1):
        """ Predict future metric value asynchronously using MoE. """
        if metric_name not in self.forecasting_models:
            # Basic trend fallback (sync)
            # ... (trend logic) ...
            return None # Or return trend prediction

        expert_id = self.forecasting_models[metric_name]
        self.logger.info(f"Attempting async prediction for {metric_name} using expert {expert_id} via MoE.")
        recent_history = [h['value'] for h in list(self.metric_history.get(metric_name, []))[-20:]]
        if not recent_history: return None
        input_data = {"historical_values": recent_history, "steps_to_predict": horizon_steps}
        try:
            # Assume invoke_expert is async or wrap it
            prediction_result = await self.moe_manager.invoke_expert(expert_id, input_data)
            # ... (result parsing logic remains similar) ...
            if prediction_result and isinstance(prediction_result, list) and len(prediction_result) > 0:
                 # Assuming prediction is directly in the list or nested
                 pred_list = prediction_result[0].get('prediction', prediction_result) if isinstance(prediction_result[0], dict) else prediction_result
                 if isinstance(pred_list, list) and len(pred_list) >= horizon_steps:
                      predicted_value = pred_list[horizon_steps - 1]
                      self.logger.info(f"Async forecasted {metric_name} using {expert_id}: {predicted_value}")
                      return predicted_value
            self.logger.warning(f"Async forecasting expert {expert_id} returned invalid data.")
        except Exception as e:
            self.logger.error(f"Async forecasting failed for {metric_name}: {e}", exc_info=False)
        return None

    # _evaluate_objectives remains synchronous logic

    async def check_and_optimize(self, predictive=False):
        """ Evaluate objectives asynchronously and trigger optimization decisions via HDE. """
        self.logger.info(f"Running async optimization check (Predictive: {predictive})...")
        optimization_triggered = False
        current_state = {m: self._get_current_metric_value(m) for m in self.metric_history if self._get_current_metric_value(m) is not None}
        predicted_state = {}
        if predictive:
            relevant_metrics = set(m for obj in self.objectives.values() for m in obj.get("metrics", []))
            predict_tasks = {metric: self._predict_metric_value(metric, horizon_steps=self.config.get("prediction_horizon", 3)) for metric in relevant_metrics}
            results = await asyncio.gather(*predict_tasks.values())
            predicted_state = {metric: res for metric, res in zip(predict_tasks.keys(), results) if res is not None}

        state_to_evaluate = predicted_state if predictive and predicted_state else current_state
        if not state_to_evaluate: self.logger.info("Insufficient metric data."); return False

        objective_scores = self._evaluate_objectives(state_to_evaluate)
        threshold = self.config.get("optimization_threshold", 0.7)

        for obj_id, score in objective_scores.items():
            if score < threshold:
                self.logger.warning(f"Objective '{obj_id}' score ({score:.2f}) below threshold. Triggering async optimization.")
                decision_context = {"current_metrics": current_state, "predicted_metrics": predicted_state, "objective_scores": objective_scores, "failing_objective_id": obj_id, "failing_objective_details": self.objectives.get(obj_id, {})}
                # Trigger HDE asynchronously
                optimization_decision_log = await self.decision_engine.make_decision("system_optimization", decision_context)
                opt_log = {"timestamp": datetime.datetime.now().isoformat(), "trigger": "predictive" if predictive else "reactive", "reason": f"Objective '{obj_id}' score {score:.2f} < {threshold}", "state_evaluated": state_to_evaluate, "decision_log_ref": optimization_decision_log.get("decision_id")}
                self.optimization_history.append(opt_log)
                optimization_triggered = True
                break # Trigger only once per cycle (MVP)

        if not optimization_triggered: self.logger.info("All objectives on track.")
        return optimization_triggered

    # get_optimization_history, get_objective_status remain synchronous

# --- Enhanced Autonomous Goal Generator (Async & MoA Comms) ---
class AutonomousGoalGenerator:
    """ Autonomously identifies opportunities and generates goals, triggering agents via messages. """
    def __init__(self, knowledge_graph: Any, optimizer: 'HolisticOptimizer',
                 config: Dict, communication_system: Optional[UnifiedCommunicationSystem]): # Use MoA Comms
        self.kg = knowledge_graph; self.optimizer = optimizer; self.config = config
        self.communication = communication_system # Store comms system
        self.goals = {}; self.goal_history = deque(maxlen=1000)
        self.logger = logging.getLogger('MIZ-OKI.AutonomousGoals')
        if not self.communication: self.logger.warning("CommunicationSystem not provided. Cannot trigger goal pursuit tasks.")
        self.logger.info("Autonomous Goal Generator initialized (Async & MoA Integrated).")

    async def identify_and_generate_goals(self):
        """ Autonomously analyzes system state asynchronously and generates goals. """
        self.logger.info("Running async goal identification cycle...")
        new_goals_generated = 0
        objective_status = self.optimizer.get_objective_status() # Sync call ok
        goal_gen_threshold = self.config.get("goal_generation_threshold", 0.6)

        generation_tasks = []
        for obj_id, score in objective_status.items():
            if score < goal_gen_threshold:
                if not any(g["status"] == "active" and g.get("related_objective_id") == obj_id for g in self.goals.values()):
                    # Create task to generate and add goal asynchronously
                    generation_tasks.append(self._generate_and_add_goal(obj_id, score))

        results = await asyncio.gather(*generation_tasks)
        new_goals_generated = sum(1 for res in results if res is not None)

        self.logger.info(f"Async goal identification complete. Generated {new_goals_generated} new goals.")
        return new_goals_generated

    async def _generate_and_add_goal(self, obj_id, score):
        """ Helper to generate details and add a single goal asynchronously. """
        try:
            obj_config = self.optimizer.objectives.get(obj_id, {})
            goal_desc = f"Improve objective: '{obj_config.get('description', obj_id)}' (Score: {score:.2f})"
            kpis = obj_config.get("metrics", [])
            owner_agent = "OptimizationAgent" # Placeholder - needs better assignment logic
            priority = max(0.1, min(1.0, (1.0 - score) * 1.5))
            target_values = {kpi: self.config.get("targets", {}).get(kpi) for kpi in kpis if self.config.get("targets", {}).get(kpi) is not None}
            return await self.add_goal(goal_desc, kpis, owner_agent, priority=priority, target_values=target_values, related_objective_id=obj_id, source="autonomous_po")
        except Exception as e:
            self.logger.error(f"Failed to generate goal for objective {obj_id}: {e}")
            return None

    async def add_goal(self, description, kpis, owner_agent, priority=0.5, target_values=None, related_objective_id=None, source="unknown"):
        """ Adds a new goal asynchronously and triggers the owner agent via message. """
        if not description or not kpis: return None
        goal_id = f"goal_{uuid.uuid4().hex[:12]}"
        goal_data = { "id": goal_id, "description": description, "kpis": kpis, "target_values": target_values or {}, "related_objective_id": related_objective_id, "owner_agent": owner_agent, "priority": max(0.0, min(1.0, priority)), "status": "active", "progress": 0.0, "source": source, "created_at": datetime.datetime.now().isoformat(), "updated_at": datetime.datetime.now().isoformat()}
        self.goals[goal_id] = goal_data
        log_entry = {"timestamp": goal_data["created_at"], "goal_id": goal_id, "action": "created", "details": "..."} # Avoid logging full data
        self.goal_history.append(log_entry)
        self.logger.info(f"Added new goal '{goal_id}' (Prio: {priority:.2f}): {description}")

        if self.communication:
            task_data = {"goal_id": goal_id, "goal_details": goal_data}
            message = AgentMessage(
                sender=f"GoalGenerator:{source}", receiver=owner_agent,
                message_type=MessageType.TASK_ASSIGNMENT,
                content={"task_type": "pursue_goal", **task_data},
                priority=int(priority * 10), trace_id=goal_id
            )
            await self.communication.send_message(message)
            self.logger.info(f"Sent pursue_goal message {message.id} to agent '{owner_agent}' for goal '{goal_id}'.")
            goal_data["pursuit_message_id"] = message.id
        else: self.logger.warning(f"Cannot trigger pursuit for goal '{goal_id}': CommunicationSystem unavailable.")
        return goal_id

    # update_goal_progress, retire_goal, get_active_goals, get_goal, get_history remain mostly synchronous logic

# --- Enhanced Self-Correcting Feedback (Async & MoA Comms) ---
class SelfCorrectingFeedback:
    """ Processes feedback asynchronously, triggers LI or workflow adaptation via messages. """
    def __init__(self, knowledge_graph: Any, learning_integrator: 'LearningIntegration',
                 communication_system: Optional[UnifiedCommunicationSystem], # Use MoA Comms
                 workflow_evolver: Optional[Any] = None): # Keep workflow evolver concept
        self.kg = knowledge_graph; self.learning_integrator = learning_integrator
        self.communication = communication_system # Store comms system
        self.workflow_evolver = workflow_evolver
        self.feedback_history = deque(maxlen=5000)
        self.correction_rules = defaultdict(dict)
        self.logger = logging.getLogger('MIZ-OKI.SelfCorrectingFeedback')
        if not self.communication: self.logger.warning("CommunicationSystem not provided. Cannot trigger corrective tasks.")
        self.logger.info("Self-Correcting Feedback process initialized (Async & MoA Integrated).")

    # register_correction_rule remains synchronous setup

    async def process_feedback(self, entity_id, feedback_data, source):
        """ Processes feedback asynchronously, applies rules, triggers LI or messages. """
        feedback_id = f"fb_{uuid.uuid4().hex[:12]}"
        log_entry = {"feedback_id": feedback_id, "timestamp": datetime.datetime.now().isoformat(), "entity_id": entity_id, "source": source, "feedback_data": "...", "status": "pending", "corrections_identified": [], "triggered_actions": []} # Avoid logging full data

        try:
            entity = await self.kg.get_entity(entity_id) # Use async KG method
            if not entity: raise ValueError(f"Entity {entity_id} not found.")
            entity_type = entity.get("type", "unknown"); log_entry["entity_type"] = entity_type

            # Apply Correction Rules (sync for now)
            correction_actions = []
            # ... (rule application logic) ...

            # Trigger Actions Asynchronously
            triggered_actions_summary = []
            action_tasks = [self._execute_correction_action(action, feedback_id, log_entry) for action in correction_actions]
            action_results = await asyncio.gather(*action_tasks, return_exceptions=True)
            # ... (process action_results similar to LI) ...

            log_entry["triggered_actions"] = triggered_actions_summary
            log_entry["status"] = "processed"

        except Exception as e:
            self.logger.error(f"Error processing feedback async for {entity_id}: {e}", exc_info=True)
            log_entry["status"] = "failed"; log_entry["error"] = str(e)

        self.feedback_history.append(log_entry)
        return log_entry

    async def _execute_correction_action(self, action: Dict, feedback_id: str, log_entry_ref: Dict) -> Dict:
        """ Executes a single correction action asynchronously. """
        action_type = action.get("type"); action_log = {"type": action_type}
        try:
            if action_type == "trigger_learning":
                # Use async LI
                li_result = await self.learning_integrator.integrate_learning(
                    knowledge_type=action.get("knowledge_type", "correction_feedback"),
                    knowledge_data=action.get("data", {}), # Pass relevant data
                    source=f"feedback:{feedback_id}",
                    importance=action.get("importance", 0.8)
                )
                action_log["result_id"] = li_result.get("integration_id"); action_log["status"] = li_result.get("status")
            elif action_type == "trigger_workflow_evolution":
                workflow_id = action.get("workflow_id"); evolution_details = action.get("details")
                if workflow_id and self.communication:
                    message = AgentMessage(
                        sender=f"Feedback:{feedback_id}", receiver="WorkflowEvolutionAgent", # Target agent
                        message_type=MessageType.TASK_ASSIGNMENT,
                        content={"task_type": "evolve_workflow", "workflow_id": workflow_id, "details": evolution_details},
                        trace_id=feedback_id
                    )
                    await self.communication.send_message(message)
                    action_log["workflow_id"] = workflow_id; action_log["message_id"] = message.id
                    action_log["status"] = "evolution_triggered"
                    log_entry_ref["triggered_messages"] = log_entry_ref.get("triggered_messages", []) + [message.id]
                elif not workflow_id: action_log["error"] = "Missing 'workflow_id'"
                elif not self.communication: action_log["error"] = "CommunicationSystem unavailable"
            # Add other action types (direct_kg_update using async KU)
            else: action_log["error"] = "Unsupported action type"
        except Exception as exec_e:
             self.logger.error(f"Error executing async correction action {action_type}: {exec_e}", exc_info=False)
             action_log["error"] = str(exec_e)
        return action_log

    # analyze_feedback_patterns, get_history remain mostly synchronous logic

# --- Enhanced Conflict Resolution (Async) ---
class ConflictResolution:
    """ Detects and resolves conflicts asynchronously in the KG. """
    def __init__(self, knowledge_graph: Any):
        self.kg = knowledge_graph; self.conflict_history = deque(maxlen=1000)
        self.resolution_rules = {}; self.logger = logging.getLogger('MIZ-OKI.ConflictResolution')
        self.logger.info("Conflict Resolution process initialized (Async).")

    # register_resolution_rule remains synchronous setup

    async def detect_conflicts_for_update(self, hints: Dict, current_data: Optional[Dict], proposed_updates: Dict) -> List[Dict]:
        """ Detect conflicts asynchronously for a proposed update. """
        conflicts = []; merged_data = (current_data or {}).copy(); merged_data.update(proposed_updates); merged_data['_hints'] = hints
        detection_tasks = []
        for conflict_type, rule in self.resolution_rules.items():
             detection_tasks.append(self._run_single_detection(conflict_type, rule["detection"], hints, merged_data))

        results = await asyncio.gather(*detection_tasks, return_exceptions=True)
        for i, res in enumerate(results):
             conflict_type = list(self.resolution_rules.keys())[i]
             if isinstance(res, Exception):
                  self.logger.error(f"Conflict detection rule {conflict_type} failed: {res}", exc_info=False)
             elif res: # If detection function returned details
                  conflicts.append({"conflict_type": conflict_type, "hints": hints, "details": res, "current_data": current_data, "proposed_updates": proposed_updates})
        return conflicts

    async def _run_single_detection(self, conflict_type, detection_func, hints, merged_data):
        """ Helper to run a single detection rule asynchronously. """
        if asyncio.iscoroutinefunction(detection_func):
            return await detection_func(hints, merged_data, self.kg)
        else:
            return await asyncio.to_thread(detection_func, hints, merged_data, self.kg) # Wrap sync call

    async def resolve_conflict(self, conflict: Dict) -> Dict:
        """ Attempts to resolve a detected conflict asynchronously. """
        conflict_type = conflict.get("conflict_type", "unknown"); hints = conflict.get("hints")
        log_entry = {"timestamp": datetime.datetime.now().isoformat(), "conflict": "...", "status": "pending", "resolution_details": None, "updated_attributes": None} # Avoid logging full conflict data

        if conflict_type not in self.resolution_rules:
            log_entry["status"] = "failed"; log_entry["error"] = "No resolution rule."
            self.conflict_history.append(log_entry); return {"success": False, **log_entry}

        try:
            resolution_func = self.resolution_rules[conflict_type]["resolution"]
            # Run resolution func async (wrap if sync)
            if asyncio.iscoroutinefunction(resolution_func):
                 resolution_result = await resolution_func(hints, conflict.get("details"), self.kg, conflict.get("current_data"), conflict.get("proposed_updates"))
            else:
                 resolution_result = await asyncio.to_thread(resolution_func, hints, conflict.get("details"), self.kg, conflict.get("current_data"), conflict.get("proposed_updates"))

            if not isinstance(resolution_result, dict) or "success" not in resolution_result: raise TypeError("Resolution func invalid return.")
            log_entry["status"] = "resolved" if resolution_result.get("success") else "resolution_failed"
            log_entry["resolution_details"] = resolution_result.get("details"); log_entry["updated_attributes"] = resolution_result.get("updated_attributes")
            self.conflict_history.append(log_entry)
            return {**log_entry, "success": resolution_result.get("success", False)}
        except Exception as e:
            self.logger.error(f"Async conflict resolution failed for {conflict_type} on {hints}: {e}", exc_info=True)
            log_entry["status"] = "resolution_error"; log_entry["error"] = str(e)
            self.conflict_history.append(log_entry); return {"success": False, **log_entry}

    # run_global_conflict_scan, get_history remain mostly synchronous logic

# --- Initialization ---
# Assume dependencies _eshkg, _moe_manager, _communication_system, _fm_client, CONFIG are available
_eshkg = eshkg if 'eshkg' in locals() else PlaceholderKG()
_moe_manager = moe_manager if 'moe_manager' in locals() else PlaceholderMoEManager()
_communication_system = miz_moa_system.communication_system if 'miz_moa_system' in locals() and miz_moa_system else PlaceholderCommunicationSystem()
_fm_client = foundation_model_client if 'foundation_model_client' in locals() else PlaceholderFMClient()
_adaptive_workflows = adaptive_workflows if 'adaptive_workflows' in locals() else PlaceholderWorkflowEvolver()
if 'CONFIG' not in globals(): CONFIG = {} # Use empty dict if not loaded

# Instantiate Core Process Layer Components (Async & MoA Integrated)
ethical_guardrails = EthicalGuardrailsEngine(CONFIG)
conflict_resolver = ConflictResolution(_eshkg)
knowledge_updater = KnowledgeUpdate(_eshkg, conflict_resolver)
hybrid_decision_engine = HybridDecisionEngine(_eshkg, _moe_manager, ethical_guardrails, _fm_client)
learning_integrator = LearningIntegration(_eshkg, _moe_manager, knowledge_updater, _communication_system) # Pass MoA Comms
self_correcting_feedback = SelfCorrectingFeedback(_eshkg, learning_integrator, _communication_system, _adaptive_workflows) # Pass MoA Comms
holistic_optimizer = HolisticOptimizer(CONFIG, _eshkg, hybrid_decision_engine, _moe_manager)
autonomous_goal_generator = AutonomousGoalGenerator(_eshkg, holistic_optimizer, CONFIG, _communication_system) # Pass MoA Comms

# --- Example Rule Registrations (Remain Synchronous Setup) ---
# ... (Register rules as before) ...

print("--- MIZ 3.0 Core Processes Layer Initialized (Async & MoA Integrated) ---")
# ... (Print summary as before) ...
print("-------------------------------------------------------------")

# Example Async Usage Snippet (Conceptual)
# async def simulate_core_processes():
#     print("\nSimulating Async Core Process Interactions...")
#     # 1. Update metric (sync)
#     holistic_optimizer.update_metric("roas", 2.1)
#     # 2. Optimizer checks async, triggers HDE async
#     await holistic_optimizer.check_and_optimize()
#     # 3. HDE makes decision async (action requirement logged)
#     # 4. Feedback processed async, triggers LI async
#     await self_correcting_feedback.process_feedback("customer:xyz", {"type": "accuracy"}, "human")
#     # 5. LI triggers MLOps via async message
#
# # To run: asyncio.run(simulate_core_processes())



AttributeError: 'HolisticOptimizer' object has no attribute '_load_objectives_from_config'

In [28]:
# Cell 6: Technical Flow Components (MIZ 3.0 OKI - Reworked)
# Status: RAG uses LLaMA 4 embeddings/generation via FM Client/NN. RL is base class. MoE logic refined. NN uses FM Client. R2 integrates LLaMA 4 calls. Vector DB integration via KG Adapter assumed.

import tensorflow as tf
from tensorflow.keras import layers, models, optimizers # Ensure TF components are imported
import numpy as np
# import networkx as nx # Removed dependency for core RAG logic
import os
import json
import logging
import time
import random
import functools
from typing import Dict, List, Any, Optional, Union, Tuple

# --- Placeholder Dependencies ---
# from cell3 import EnhancedSelfHealingKG, GraphStorageAdapter # Assumed KG uses Adapter
# from cell4 import MixtureOfExpertsManager # MoEManager is central now
# from cell5 import HybridDecisionEngine # For potential integration with R2
# from cell8 import DynamicRewardSystem # For RL integration
# from cell18 import FoundationModelClient # Assumed available

class PlaceholderKG: # Placeholder if real KG not available
    def get_entity(self, *args, **kwargs): logger.debug(f"PlaceholderKG.get_entity called"); return {"type": "placeholder", "mizId": args[0], "description": "Placeholder entity data."}
    def get_neighbors(self, *args, **kwargs): logger.debug(f"PlaceholderKG.get_neighbors called"); return [{"neighborId": f"neigh_{random.randint(100,999)}", "neighborProps": {"type": "related"}, "relationshipType": "RELATED_TO"}]
    # Assume adapter is part of KG or passed separately
    class PlaceholderAdapter:
        def search_by_vector(self, *args, **kwargs): logger.debug(f"PlaceholderAdapter.search_by_vector called"); return [(f"vec_match_{i}", random.random()) for i in range(5)]
        def execute_query(self, *args, **kwargs): logger.debug(f"PlaceholderAdapter.execute_query called"); return [] # Placeholder for KG queries in R2
    adapter = PlaceholderAdapter()

class PlaceholderFMClient: # Placeholder if real FM Client not available
    def generate_text(self, prompt, *args, **kwargs): logger.debug(f"PlaceholderFMClient.generate_text called"); return f"LLaMA4 response to: {prompt[:50]}..."
    def generate_embedding(self, data, *args, **kwargs):
        logger.debug(f"PlaceholderFMClient.generate_embedding called")
        if isinstance(data, list): return [np.random.rand(768) for _ in data] # Example dimension
        else: return np.random.rand(768)
# --- End Placeholder Dependencies ---


# Use the global logger
logger = logging.getLogger('MIZ-OKI.TechnicalFlows')

# --- Semantic Graph RAG (RG Pillar - Reworked) ---
class SemanticGraphRAG:
    """
    Implements Graph-enhanced RAG using LLaMA 4 embeddings/generation.
    Relies on KG Adapter for vector search and graph traversal. (MIZ 3.0 RG Pillar)
    """
    def __init__(self, knowledge_graph: 'PlaceholderKG', fm_client: 'PlaceholderFMClient', neural_processor: 'NeuralProcessing'):
        self.kg = knowledge_graph # Dependency: EnhancedSelfHealingKG instance
        self.fm_client = fm_client # Dependency: FoundationModelClient instance
        self.neural_processor = neural_processor # Dependency: NeuralProcessing instance
        self.logger = logging.getLogger('MIZ-OKI.SemanticGraphRAG')
        if not self.kg or not hasattr(self.kg, 'adapter'):
             self.logger.error("Knowledge Graph or its adapter not available. RAG functionality limited.")
        if not self.fm_client:
             self.logger.error("FoundationModelClient not available. RAG generation/embedding limited.")
        if not self.neural_processor:
             self.logger.error("NeuralProcessing component not available. Cannot generate embeddings.")
        self.logger.info("Semantic Graph RAG initialized.")

    def _get_embedding(self, text: str, model_id: str = "llama4_embedding_model") -> Optional[np.ndarray]:
        """Helper to get embedding via NeuralProcessing -> FM Client."""
        if not self.neural_processor: return None
        # Assume 'text' is the data_type for embedding text
        return self.neural_processor.get_embedding(text, data_type="text", model_id=model_id)

    def retrieve_nodes_semantic(self, query: str, k: int = 5, entity_types: Optional[List[str]] = None,
                                attribute_filters: Optional[Dict] = None,
                                vector_index_name: str = "entity_embeddings") -> List[Tuple[str, float]]:
        """
        Retrieve relevant nodes from the KG using semantic vector search via KG Adapter.
        """
        if not self.kg or not hasattr(self.kg, 'adapter') or not hasattr(self.kg.adapter, 'search_by_vector'):
            self.logger.error("KG Adapter or vector search method not available.")
            return []
        if not self.neural_processor:
             self.logger.error("NeuralProcessing unavailable for query embedding.")
             return []

        query_embedding = self._get_embedding(query)
        if query_embedding is None:
            self.logger.error("Failed to generate query embedding.")
            return []

        # MIZ 3.0 TODO: Implement pre-filtering based on entity_types/attribute_filters in the vector search call if supported by the adapter/DB.
        # Example (conceptual):
        # filter_condition = build_filter_condition(entity_types, attribute_filters)
        # results = self.kg.adapter.search_by_vector(query_embedding.tolist(), vector_index_name, k, filter=filter_condition)

        # If pre-filtering isn't supported, perform vector search first, then filter results.
        try:
            vector_results = self.kg.adapter.search_by_vector(query_embedding.tolist(), vector_index_name, k * 5) # Fetch more results for filtering
        except Exception as e:
             self.logger.error(f"Vector search failed: {e}")
             return []

        # Post-filtering (if needed)
        filtered_results = []
        nodes_added = set()
        for node_id, score in vector_results:
            if len(filtered_results) >= k: break
            if node_id in nodes_added: continue # Avoid duplicates if search returns them

            # Apply filters
            passes_filter = True
            if entity_types or attribute_filters:
                 node_data = self.kg.get_entity(node_id) # Fetch data for filtering
                 if not node_data: continue # Skip if node data not found

                 if entity_types and node_data.get("type") not in entity_types:
                      passes_filter = False
                 if attribute_filters:
                      for attr, value in attribute_filters.items():
                           if node_data.get(attr) != value:
                                passes_filter = False
                                break
            if passes_filter:
                 filtered_results.append((node_id, score))
                 nodes_added.add(node_id)

        self.logger.info(f"Retrieved {len(filtered_results)} nodes via semantic search and filtering.")
        return filtered_results[:k]

    def retrieve_and_augment(self, query: str, k: int = 5, entity_types: Optional[List[str]] = None,
                             attribute_filters: Optional[Dict] = None, include_relationships: bool = True,
                             max_depth: int = 1, vector_index_name: str = "entity_embeddings") -> List[Dict]:
        """
        Retrieves relevant nodes and augments them with context using KG Adapter methods.
        """
        top_nodes = self.retrieve_nodes_semantic(query, k, entity_types, attribute_filters, vector_index_name)
        results = []

        if not self.kg:
             self.logger.error("Knowledge Graph not available for augmentation.")
             return [{"node_id": nid, "score": s, "data": None, "context": []} for nid, s in top_nodes]

        for node_id, score in top_nodes:
            node_data = self.kg.get_entity(node_id)
            if not node_data: continue

            result_item = {"node_id": node_id, "data": node_data, "score": score, "context": []}

            if include_relationships:
                try:
                    # Use KG adapter's neighbor fetching
                    neighbors_data = self.kg.get_neighbors(node_id, direction="both") # Fetch all neighbors up to adapter's limit/implementation
                    # MIZ 3.0 TODO: Implement depth limiting in get_neighbors or via multiple calls if needed.
                    context_count = 0
                    max_context_items = 10 # Limit context size
                    for neighbor_info in neighbors_data:
                         if context_count >= max_context_items: break
                         neighbor_props = neighbor_info.get("neighborProps", {})
                         result_item["context"].append({
                             "neighbor_id": neighbor_info.get("neighborId"),
                             "neighbor_type": neighbor_props.get("type"),
                             "relationship": neighbor_info.get("relationshipType"),
                             "neighbor_preview": {k: v for k, v in neighbor_props.items() if k in ['name', 'status'] and v is not None}
                         })
                         context_count += 1
                except Exception as e:
                     self.logger.warning(f"Error getting neighbors for {node_id} via adapter: {e}")

            results.append(result_item)
        return results

    def generate_response(self, query: str, retrieved_context: List[Dict],
                          model_alias: str = "llama4_scout", max_tokens: int = 512) -> Optional[str]:
        """Generates a response using LLaMA 4, grounded in the retrieved context."""
        if not self.fm_client:
             self.logger.error("FoundationModelClient not available for generation.")
             return None
        if not retrieved_context:
             self.logger.warning("No context provided for generation. Generating based on query only.")
             context_str = "No specific context available."
        else:
             # Format context for the prompt
             context_parts = []
             for item in retrieved_context[:3]: # Limit context in prompt
                  data_preview = json.dumps({k:v for k,v in item.get('data',{}).items() if k != 'embedding'}, default=str, indent=0)[:200]
                  context_parts.append(f"Node ID: {item.get('node_id')}\nScore: {item.get('score', 0):.2f}\nData: {data_preview}...")
                  # Add neighbor info if needed
             context_str = "\n---\n".join(context_parts)

        prompt = f"""Based on the following context retrieved from the knowledge graph, answer the query.
Context:
---
{context_str}
---
Query: {query}
Answer:"""

        try:
            response = self.fm_client.generate_text(prompt, model_alias=model_alias, max_tokens=max_tokens)
            return response
        except Exception as e:
             self.logger.error(f"LLaMA 4 generation failed in RAG: {e}")
             return None

    def explain_retrieval(self, query, results):
        """Provide explanation for retrieval results (Placeholder)."""
        # MIZ 3.0 TODO: Implement more sophisticated explanation, potentially using LLaMA 4 to analyze query vs node content.
        explanations = []
        for result in results:
            explanations.append({
                "node_id": result.get("node_id"),
                "relevance_score": result.get("score"),
                "reason": f"Retrieved based on semantic similarity score {result.get('score', 0):.2f} (Explanation Placeholder)."
            })
        return explanations

# --- Context-Adaptive Reinforcement Learning (RL Pillar - Base Class) ---
class ContextAdaptiveRL:
    """
    Base class for Reinforcement Learning agents. (MIZ 3.0 RL Pillar)
    Advanced implementations (Offline, MARL) are in Cell 8 or specialized agents.
    """
    def __init__(self, state_dim, action_dim, hidden_dim=64, learning_rate_actor=0.001, learning_rate_critic=0.002, gamma=0.99, epsilon_decay=0.995, batch_size=64, max_memory_size=10000):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim
        self.gamma = gamma
        self.epsilon = 1.0
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = 0.01
        self.batch_size = batch_size
        self.memory = deque(maxlen=max_memory_size) # Use deque

        self.actor = self._build_network(self.state_dim, self.action_dim, activation='softmax', name='Actor')
        self.critic = self._build_network(self.state_dim, 1, activation=None, name='Critic')

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate_actor)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate_critic)

        self.logger = logging.getLogger('MIZ-OKI.ContextAdaptiveRL')
        self.logger.info("Context-Adaptive RL base agent initialized.")

    def _build_network(self, input_dim, output_dim, activation, name):
        """Builds a simple dense network."""
        model = tf.keras.Sequential(name=name)
        model.add(layers.Input(shape=(input_dim,)))
        model.add(layers.Dense(self.hidden_dim, activation='relu'))
        model.add(layers.Dense(self.hidden_dim, activation='relu'))
        model.add(layers.Dense(output_dim, activation=activation))
        # No compilation here; losses handled in train step
        return model

    def get_action(self, state, explore=True):
        """Get action using epsilon-greedy policy."""
        if explore and np.random.rand() < self.epsilon:
            return np.random.choice(self.action_dim)
        try:
            state = np.reshape(state, [1, self.state_dim])
            action_probs = self.actor(state, training=False)[0] # Use direct call
            return np.argmax(action_probs)
        except Exception as e:
             self.logger.error(f"Error during actor prediction: {e}. Choosing random action.")
             return np.random.choice(self.action_dim)

    def remember(self, state, action, reward, next_state, done):
        """Store experience."""
        self.memory.append((state, action, reward, next_state, done))

    def train(self):
        """Train Actor-Critic networks using experience replay."""
        if len(self.memory) < self.batch_size: return None
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        rewards = rewards.astype(np.float32)
        dones = dones.astype(np.float32)

        try:
            # Train Critic
            with tf.GradientTape() as tape:
                next_vals = tf.squeeze(self.critic(next_states, training=True))
                target_vals = rewards + self.gamma * next_vals * (1.0 - dones)
                current_vals = tf.squeeze(self.critic(states, training=True))
                advantages = target_vals - current_vals
                critic_loss = tf.keras.losses.mean_squared_error(target_vals, current_vals)
            critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))

            # Train Actor
            with tf.GradientTape() as tape:
                action_probs = self.actor(states, training=True)
                actions_onehot = tf.one_hot(actions, self.action_dim, dtype=tf.float32)
                log_probs = tf.math.log(tf.reduce_sum(action_probs * actions_onehot, axis=1) + 1e-10)
                actor_loss = -tf.reduce_mean(log_probs * tf.stop_gradient(advantages))
            actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))

            if self.epsilon > self.epsilon_min: self.epsilon *= self.epsilon_decay
            metrics = {"critic_loss": float(tf.reduce_mean(critic_loss)), "actor_loss": float(actor_loss), "epsilon": self.epsilon}
            return metrics
        except Exception as e:
            self.logger.error(f"Training step failed: {e}", exc_info=True)
            return None

    # Save/Load methods remain similar

# --- Dynamic Expert Evolution (MoE Pillar - Logic Component) ---
class DynamicExpertEvolution:
    """
    Implements logic for self-organizing Mixture of Experts evolution. (MIZ 3.0 MoE Pillar)
    Requires integration with MoEManager and MLOps pipeline.
    """
    def __init__(self, config: Dict, input_dim, output_dim, num_experts=3, expert_hyperparams=None, gating_hyperparams=None):
        self.config = config
        self.input_dim = input_dim; self.output_dim = output_dim; self.num_experts = num_experts
        self.expert_hyperparams = expert_hyperparams or {"hidden_layers": [64, 32], "activation": "relu"}
        self.gating_hyperparams = gating_hyperparams or {"hidden_layers": [32], "activation": "relu"}
        self.experts = self._build_experts()
        self.gating = self._build_gating()
        self.expert_performance = {i: {'calls': 0, 'score': 0.5} for i in range(num_experts)}
        # Load thresholds from config
        self.evolution_threshold = self.config.get("moe_evolution_threshold", 0.7)
        self.expert_creation_threshold = self.config.get("moe_creation_threshold", 0.85)
        self.expert_retirement_threshold = self.config.get("moe_retirement_threshold", 0.3)
        self.max_experts = self.config.get("max_experts", 12)
        self.logger = logging.getLogger('MIZ-OKI.DynamicExpertEvolution')
        self.logger.info(f"Dynamic Expert Evolution logic initialized with {num_experts} experts.")

    # _build_single_expert, _build_experts, _build_gating remain similar
    def _build_single_expert(self):
        model = tf.keras.Sequential()
        model.add(layers.Input(shape=(self.input_dim,)))
        for units in self.expert_hyperparams.get("hidden_layers", [64, 32]): model.add(layers.Dense(units, activation=self.expert_hyperparams.get("activation", "relu")))
        model.add(layers.Dense(self.output_dim, activation='linear'))
        return model
    def _build_experts(self): return [self._build_single_expert() for _ in range(self.num_experts)]
    def _build_gating(self):
        inputs = tf.keras.layers.Input(shape=(self.input_dim,)); x = inputs
        for units in self.gating_hyperparams.get("hidden_layers", [32]): x = tf.keras.layers.Dense(units, activation=self.gating_hyperparams.get("activation", "relu"))(x)
        outputs = tf.keras.layers.Dense(self.num_experts, activation='softmax')(x)
        return tf.keras.Model(inputs=inputs, outputs=outputs)

    # predict_gating_weights, predict_expert_outputs, predict remain similar
    def predict_gating_weights(self, x):
        x = np.reshape(x, [-1, self.input_dim])
        try: return self.gating.predict(x, verbose=0)
        except Exception as e: self.logger.error(f"Gating prediction failed: {e}"); return np.ones((x.shape[0], self.num_experts)) / self.num_experts
    def predict_expert_outputs(self, x):
        x = np.reshape(x, [-1, self.input_dim]); expert_outputs = []
        for i, expert in enumerate(self.experts):
            try: expert_outputs.append(expert.predict(x, verbose=0))
            except Exception as e: self.logger.error(f"Expert {i} prediction failed: {e}"); expert_outputs.append(np.zeros((x.shape[0], self.output_dim)))
        return np.array(expert_outputs)
    def predict(self, x):
        expert_weights = self.predict_gating_weights(x); expert_outputs = self.predict_expert_outputs(x)
        expert_outputs_transposed = np.transpose(expert_outputs, (1, 0, 2))
        expanded_weights = np.expand_dims(expert_weights, axis=-1)
        weighted_outputs = expert_outputs_transposed * expanded_weights
        combined_output = np.sum(weighted_outputs, axis=1)
        return combined_output, expert_weights

    # train_step remains similar (MVP: primary loss only)
    def train_step(self, x_batch, y_batch, optimizer):
        with tf.GradientTape() as tape:
            expert_outputs = tf.stack([expert(x_batch, training=True) for expert in self.experts], axis=1)
            gating_weights = self.gating(x_batch, training=True)
            expanded_weights = tf.expand_dims(gating_weights, axis=-1)
            weighted_outputs = expert_outputs * expanded_weights
            combined_output = tf.reduce_sum(weighted_outputs, axis=1)
            primary_loss = tf.reduce_mean(tf.keras.losses.mean_squared_error(y_batch, combined_output))
            # MIZ 3.0 TODO: Add load balancing loss
            total_loss = primary_loss
        trainable_vars = self.gating.trainable_variables + [var for expert in self.experts for var in expert.trainable_variables]
        if not trainable_vars: return total_loss, gating_weights
        gradients = tape.gradient(total_loss, trainable_vars)
        optimizer.apply_gradients(zip(gradients, trainable_vars))
        return total_loss, gating_weights

    # update_expert_performance remains similar (simplified EMA)
    def update_expert_performance(self, gating_weights, batch_loss):
        if gating_weights is None: return
        try:
            avg_weights = np.mean(gating_weights, axis=0)
            if len(avg_weights) != self.num_experts: return # Safety check
            perf_signal = (1.0 - min(float(batch_loss), 2.0) / 2.0)
            for i in range(self.num_experts):
                score = self.expert_performance[i]['score']
                usage = avg_weights[i]
                self.expert_performance[i]['score'] = 0.9 * score + 0.1 * (usage * perf_signal)
                self.expert_performance[i]['calls'] += usage
        except Exception as e: self.logger.error(f"Error updating expert performance: {e}")

    # evolve_experts remains similar (MVP: score-based add/retire)
    def evolve_experts(self):
        evolved = False; retired_indices = []
        self.logger.info("Checking expert performance for evolution...")
        for i in range(self.num_experts):
            if i not in self.expert_performance: continue
            score = self.expert_performance[i]['score']; calls = self.expert_performance[i]['calls']
            self.logger.debug(f"Expert {i}: Score={score:.3f}, Calls={calls:.1f}")
            if score < self.expert_retirement_threshold and calls > 10 and self.num_experts > 1:
                self.logger.warning(f"Retiring expert {i} (score: {score:.3f}).")
                retired_indices.append(i); evolved = True
        if retired_indices:
            new_experts = []; new_performance = {}; new_idx = 0
            for i in range(self.num_experts):
                if i not in retired_indices:
                    new_experts.append(self.experts[i]); new_performance[new_idx] = self.expert_performance[i]; new_idx += 1
            self.experts = new_experts; self.expert_performance = new_performance; self.num_experts = len(self.experts)
            self.gating = self._build_gating(); self.logger.info(f"Rebuilt gating for {self.num_experts} experts.")
        avg_score = np.mean([p['score'] for p in self.expert_performance.values()]) if self.expert_performance else 0.0
        if avg_score > self.expert_creation_threshold and self.num_experts < self.max_experts:
            self.logger.info(f"High performance detected (Avg Score: {avg_score:.3f}). Adding new expert.")
            self.experts.append(self._build_single_expert()); new_idx = self.num_experts
            self.expert_performance[new_idx] = {'calls': 0, 'score': 0.5}; self.num_experts += 1
            self.gating = self._build_gating(); self.logger.info(f"Added expert. Total: {self.num_experts}. Rebuilt gating.")
            evolved = True
        if evolved: self.logger.info("Expert evolution cycle complete.")
        else: self.logger.info("No expert evolution occurred.")
        return evolved

# --- Neural Processing (NN Pillar - Reworked) ---
class NeuralProcessing:
    """
    Handles multimodal neural processing using FoundationModelClient for LLaMA 4. (MIZ 3.0 NN Pillar)
    """
    def __init__(self, config: Dict, fm_client: Optional['PlaceholderFMClient'] = None): # Inject FM Client
        self.config = config
        self.fm_client = fm_client # Store FM Client
        self.processors = {} # data_type -> processor_func(data) -> processed_for_embedding
        self.embedding_models = {} # model_id -> embedding_func(processed_data) -> embedding
        self.logger = logging.getLogger('MIZ-OKI.NeuralProcessing')
        if not self.fm_client: self.logger.warning("FoundationModelClient not provided. Embedding/processing capabilities limited.")
        self.logger.info("Neural Processing component initialized.")

    def register_processor(self, data_type, processor_func):
        """Register a pre-processor for a specific data type."""
        self.processors[data_type] = processor_func
        self.logger.info(f"Registered processor for data type: {data_type}")

    def register_embedding_model(self, model_id, embedding_func_or_alias: Union[Callable, str]):
        """Register an embedding function or an alias for FM Client embedding."""
        self.embedding_models[model_id] = embedding_func_or_alias
        self.logger.info(f"Registered embedding model/alias: {model_id}")

    def process_data(self, data, data_type):
        """Process data using the registered processor."""
        if data_type in self.processors:
            try: return self.processors[data_type](data)
            except Exception as e: self.logger.error(f"Processor for '{data_type}' failed: {e}"); return None
        return data # Return raw if no processor

    def get_embedding(self, data, data_type, model_id="llama4_embedding_model"): # Default to LLaMA 4 alias
        """Process data and get embedding, prioritizing FM Client."""
        if model_id not in self.embedding_models:
            # If ID not registered, try using FM Client directly if available
            if self.fm_client and hasattr(self.fm_client, 'generate_embedding'):
                 self.logger.debug(f"Model ID '{model_id}' not registered, attempting direct FM Client embedding.")
                 processed_data = self.process_data(data, data_type)
                 if processed_data is None: return None
                 try:
                      # Assume generate_embedding handles single/batch
                      return self.fm_client.generate_embedding(processed_data, model_alias=model_id)
                 except Exception as e:
                      self.logger.error(f"Direct FM Client embedding failed for '{model_id}': {e}")
                      return None
            else:
                 self.logger.error(f"Embedding model/alias '{model_id}' not found and FM Client unavailable/unsuitable.")
                 return None

        embedding_source = self.embedding_models[model_id]
        processed_data = self.process_data(data, data_type)
        if processed_data is None: return None

        try:
            if isinstance(embedding_source, str) and self.fm_client:
                # It's an alias for the FM Client
                embedding = self.fm_client.generate_embedding(processed_data, model_alias=embedding_source)
            elif callable(embedding_source):
                # It's a direct function
                embedding = embedding_source(processed_data)
            else:
                 raise TypeError(f"Invalid embedding source for '{model_id}': {embedding_source}")

            # Validate embedding format
            if not isinstance(embedding, np.ndarray): embedding = np.array(embedding)
            return embedding
        except Exception as e:
            self.logger.error(f"Embedding generation failed for '{model_id}': {e}", exc_info=False)
            return None

    def batch_embed(self, data_items: List, data_types: List[str], model_id="llama4_embedding_model"):
        """Get embeddings for a batch, prioritizing FM Client."""
        if model_id not in self.embedding_models:
            if self.fm_client and hasattr(self.fm_client, 'generate_embedding'):
                 self.logger.debug(f"Model ID '{model_id}' not registered, attempting direct FM Client batch embedding.")
                 processed_batch = [self.process_data(d, t) for d, t in zip(data_items, data_types)]
                 valid_indices = [i for i, p in enumerate(processed_batch) if p is not None]
                 valid_data = [p for p in processed_batch if p is not None]
                 if not valid_data: return [None] * len(data_items)
                 try:
                      batch_embeddings = self.fm_client.generate_embedding(valid_data, model_alias=model_id)
                      full_embeddings = [None] * len(data_items)
                      if len(batch_embeddings) == len(valid_indices):
                           for i, idx in enumerate(valid_indices): full_embeddings[idx] = batch_embeddings[i]
                      return full_embeddings
                 except Exception as e:
                      self.logger.error(f"Direct FM Client batch embedding failed for '{model_id}': {e}")
                      return [None] * len(data_items)
            else:
                 self.logger.error(f"Embedding model/alias '{model_id}' not found and FM Client unavailable/unsuitable.")
                 return [None] * len(data_items)

        embedding_source = self.embedding_models[model_id]
        processed_batch = [self.process_data(d, t) for d, t in zip(data_items, data_types)]
        valid_indices = [i for i, p in enumerate(processed_batch) if p is not None]
        valid_data = [p for p in processed_batch if p is not None]
        if not valid_data: return [None] * len(data_items)

        try:
            if isinstance(embedding_source, str) and self.fm_client:
                batch_embeddings = self.fm_client.generate_embedding(valid_data, model_alias=embedding_source)
            elif callable(embedding_source):
                # Assume callable can handle batch
                batch_embeddings = embedding_source(valid_data)
            else:
                 raise TypeError(f"Invalid embedding source for '{model_id}': {embedding_source}")

            # Reconstruct full result list
            full_embeddings = [None] * len(data_items)
            if len(batch_embeddings) == len(valid_indices):
                 for i, idx in enumerate(valid_indices): full_embeddings[idx] = batch_embeddings[i]
            else: self.logger.error("Batch embedding result size mismatch.")
            return full_embeddings
        except Exception as e:
            self.logger.error(f"Batch embedding failed for '{model_id}': {e}", exc_info=False)
            return [None] * len(data_items)

# --- R2 Reasoning (R2 Pillar - Reworked) ---
class R2Reasoning:
    """
    Implements step-by-step reasoning using templates, integrating LLaMA 4 calls. (MIZ 3.0 R2 Pillar)
    """
    def __init__(self, kg: Optional['PlaceholderKG'] = None,
                 decision_engine: Optional['HybridDecisionEngine'] = None, # Cell 5 HDE
                 fm_client: Optional['PlaceholderFMClient'] = None): # Cell 18 FM Client
        self.kg = kg
        self.decision_engine = decision_engine
        self.fm_client = fm_client # Store FM Client
        self.reasoning_templates = {}
        self.reasoning_history = deque(maxlen=1000) # Use deque
        self.logger = logging.getLogger('MIZ-OKI.R2Reasoning')
        if not self.fm_client: self.logger.warning("FoundationModelClient not provided. LLaMA 4 reasoning steps disabled.")
        self.logger.info("R2 Reasoning component initialized.")

    def register_template(self, template_id, steps_template, variables=None, conclusion_template=None):
        """Register a reasoning template."""
        if not isinstance(steps_template, list):
             self.logger.error("Steps template must be a list."); return False
        self.reasoning_templates[template_id] = {
            "steps": steps_template, "variables": variables or [],
            "conclusion": conclusion_template, "created_at": datetime.datetime.now().isoformat()
        }
        self.logger.info(f"Registered reasoning template: {template_id}"); return True

    def reason(self, template_id, input_data):
        """Perform step-by-step reasoning using a template with CoT logging and LLaMA 4 integration."""
        if template_id not in self.reasoning_templates:
            self.logger.error(f"Reasoning template '{template_id}' not found.")
            return None

        template = self.reasoning_templates[template_id]
        reasoning_id = f"r2_{template_id}_{uuid.uuid4().hex[:12]}"
        log_entry = {
            "reasoning_id": reasoning_id, "template_id": template_id, "input_data": input_data,
            "timestamp_start": datetime.datetime.now().isoformat(), "status": "running",
            "steps_executed": [], "variables_state": {}, "conclusion": None, "chain_of_thought": []
        }
        variables = {}; cot = log_entry["chain_of_thought"]

        try:
            # Initialize variables
            for var in template.get("variables", []): variables[var] = input_data.get(var)
            log_entry["variables_state"]["initial"] = variables.copy()
            cot.append(f"Initial state: {variables}")

            # Execute steps
            for i, step_config in enumerate(template["steps"]):
                step_log = {"step_number": i + 1, "config": step_config}
                start_step_time = time.time()
                step_description_formatted = f"Step {i+1}" # Default description

                try:
                    # Format description
                    step_description = step_config.get("text", f"Execute step {i+1}")
                    try: step_description_formatted = step_description.format(**variables)
                    except Exception as fmt_e: self.logger.warning(f"Formatting step {i+1} desc failed: {fmt_e}"); step_description_formatted = step_description
                    step_log["description"] = step_description_formatted
                    cot.append(f"Step {i+1}: {step_description_formatted}")

                    # Execute step logic
                    logic = step_config.get("logic")
                    step_result = None
                    info = None

                    if isinstance(logic, dict) and 'type' in logic: # Check for structured logic config
                        logic_type = logic.get('type')
                        if logic_type == 'llama4' and self.fm_client:
                            prompt_template = logic.get("prompt_template")
                            model_alias = logic.get("model_alias", "llama4_scout")
                            output_var = logic.get("output_variable")
                            if not prompt_template or not output_var: raise ValueError("Missing prompt_template or output_variable for llama4 logic.")
                            try: prompt = prompt_template.format(**variables)
                            except Exception as fmt_e: raise ValueError(f"Formatting LLaMA4 prompt failed: {fmt_e}")
                            cot.append(f"  > Calling LLaMA 4 ({model_alias}) with prompt: {prompt[:100]}...")
                            step_result = self.fm_client.generate_text(prompt, model_alias=model_alias)
                            if step_result is not None:
                                 variables[output_var] = step_result # Update variable directly
                                 info = f"LLaMA 4 ({model_alias}) response stored in '{output_var}'."
                                 cot.append(f"  > LLaMA 4 Response (stored in {output_var}): {str(step_result)[:100]}...")
                            else: raise RuntimeError("LLaMA 4 call returned None.")
                        elif logic_type == 'kg_query' and self.kg:
                             query_template = logic.get("query_template")
                             output_var = logic.get("output_variable")
                             if not query_template or not output_var: raise ValueError("Missing query_template or output_variable for kg_query logic.")
                             try: query = query_template.format(**variables)
                             except Exception as fmt_e: raise ValueError(f"Formatting KG query failed: {fmt_e}")
                             cot.append(f"  > Querying KG: {query}")
                             # Assumes adapter handles query execution
                             query_result = self.kg.adapter.execute_query(query)
                             variables[output_var] = query_result # Store full result list/dict
                             info = f"KG query result stored in '{output_var}' ({len(query_result)} records)."
                             cot.append(f"  > KG Result (stored in {output_var}): {str(query_result)[:100]}...")
                        # MIZ 3.0 TODO: Add logic types for 'decision_engine_call', 'causal_query', etc.
                        else: raise ValueError(f"Unsupported structured logic type: {logic_type} or required component missing.")

                    elif callable(logic): # Handle simple callable logic
                        step_result = logic(variables, kg=self.kg, decision_engine=self.decision_engine)
                        if step_result and isinstance(step_result, dict):
                            logic_output = {k:v for k,v in step_result.items() if not k.startswith('_')}
                            variables.update(logic_output)
                            step_log["logic_output_summary"] = logic_output
                            info = step_result.get("_info")
                        else: step_log["logic_output_summary"] = step_result
                    else:
                         self.logger.debug(f"Step {i+1} has no executable logic defined.")
                         info = "No logic executed."

                    step_log["status"] = "success"
                    step_log["info"] = info
                    if info: cot.append(f"  > Info: {info}")
                    log_entry["variables_state"][f"after_step_{i+1}"] = variables.copy()

                except Exception as step_e:
                    self.logger.error(f"Error executing step {i+1} of template {template_id}: {step_e}", exc_info=True)
                    step_log["status"] = "failed"; step_log["error"] = str(step_e)
                    log_entry["steps_executed"].append(step_log)
                    cot.append(f"  > STEP FAILED: {step_e}")
                    raise step_e # Stop reasoning on step failure

                step_log["duration_ms"] = (time.time() - start_step_time) * 1000
                log_entry["steps_executed"].append(step_log)

            # Generate conclusion
            conclusion_template = template.get("conclusion")
            if conclusion_template:
                try:
                     log_entry["conclusion"] = conclusion_template.format(**variables) if isinstance(conclusion_template, str) else conclusion_template(variables)
                     cot.append(f"Conclusion: {log_entry['conclusion']}")
                except Exception as fmt_e: self.logger.error(f"Formatting conclusion failed: {fmt_e}"); log_entry["conclusion"] = str(conclusion_template)

            log_entry["status"] = "success"

        except Exception as e:
            log_entry["status"] = "failed"; log_entry["error"] = str(e)
            if not cot or "FAILED" not in cot[-1]: cot.append(f"PROCESS FAILED: {e}")

        log_entry["timestamp_end"] = datetime.datetime.now().isoformat()
        self.reasoning_history.append(log_entry)
        # MIZ 3.0 TODO: Persist reasoning log
        return log_entry

    def get_reasoning_log(self, reasoning_id):
        """Get the detailed log for a specific reasoning process."""
        # MIZ 3.0 TODO: Retrieve from persistent storage
        for log in reversed(self.reasoning_history):
            if log.get("reasoning_id") == reasoning_id: return log
        return None

    def get_history(self, template_id=None, limit=10):
        """Get reasoning history."""
        # MIZ 3.0 TODO: Retrieve from persistent storage
        if template_id: filtered = [log for log in self.reasoning_history if log.get("template_id") == template_id]
        else: filtered = list(self.reasoning_history)
        return sorted(filtered, key=lambda x: x.get("timestamp_start", ""), reverse=True)[:limit]

# --- Initialization ---
# Assume dependencies _eshkg, _decision_engine, _fm_client, _config are available
_eshkg = eshkg if 'eshkg' in locals() else PlaceholderKG()
_decision_engine = hybrid_decision_engine if 'hybrid_decision_engine' in locals() else None
_fm_client = foundation_model_client if 'foundation_model_client' in locals() else PlaceholderFMClient()
_config = CONFIG if 'CONFIG' in locals() else {}

# Instantiate Technical Flow components
neural_processing = NeuralProcessing(_config, _fm_client) # Pass FM Client
semantic_rag = SemanticGraphRAG(_eshkg, _fm_client, neural_processing) if _eshkg and _fm_client and neural_processing else None
context_rl = ContextAdaptiveRL(state_dim=10, action_dim=5) # Example dimensions
dynamic_experts_logic = DynamicExpertEvolution(_config, input_dim=10, output_dim=2) # Logic component
r2_reasoning = R2Reasoning(kg=_eshkg, decision_engine=_decision_engine, fm_client=_fm_client) # Pass FM Client

# Register default embedding model in NeuralProcessing
if neural_processing:
    def default_embedding_func_wrapper(processed_text):
        # Wrapper to call FM Client's embedding method
        if _fm_client and hasattr(_fm_client, 'generate_embedding'):
            # Use a default LLaMA 4 embedding alias if configured
            alias = _config.get("default_embedding_model_alias", "llama4_embedding_model") # Add this to config
            return _fm_client.generate_embedding(processed_text, model_alias=alias)
        else:
            logger.warning("FM Client unavailable for default embedding func.")
            if isinstance(processed_text, list): return [np.random.rand(768) for _ in processed_text]
            else: return np.random.rand(768)
    neural_processing.register_embedding_model("llama4_embedding_model", default_embedding_func_wrapper)

# Register R2 template (example)
# (Using functions defined in the original Cell 6 init for brevity)
def bid_logic_step1(vars, **kwargs): current_roas = vars.get("current_roas", 0); return {"roas_analysis": "good" if current_roas > 4.0 else "poor"}
def bid_logic_step4(vars, **kwargs):
    adj = 0.0; info = "Neutral."
    if vars.get("roas_analysis") == "good": adj += 0.05; info = "Good ROAS."
    if vars.get("trend_analysis") == "positive": adj += 0.05; info += " Pos trend."
    if vars.get("roas_analysis") == "poor": adj -= 0.05; info = "Poor ROAS."
    if vars.get("trend_analysis") == "negative": adj -= 0.05; info += " Neg trend."
    return {"bid_adjustment": adj, "_info": info.strip()}

bid_template_steps = [
    {"text": "Analyze current ROAS ({current_roas})", "logic": bid_logic_step1},
    {"text": "Evaluate trend ({trend})", "logic": lambda vars, **kwargs: {"trend_analysis": "positive" if vars.get("trend") == "up" else "negative"}},
    # Example LLaMA 4 step: Get market sentiment
    {"text": "Assess market sentiment for {campaign_id}",
     "logic": {"type": "llama4",
               "prompt_template": "Briefly assess the current market sentiment (positive, negative, neutral) for a product related to campaign '{campaign_id}'. Sentiment:",
               "model_alias": "llama4_scout", # Use efficient model
               "output_variable": "market_sentiment"}},
    {"text": "Calculate bid adjustment based on ROAS, trend, sentiment ({market_sentiment})", "logic": bid_logic_step4}, # Modified logic needed to use sentiment
    {"text": "Apply competition factor ({competition})", "logic": lambda vars, **kwargs: {"final_adjustment": vars.get("bid_adjustment", 0) * (1.2 if vars.get("competition") == "high" else 0.8), "_info": f"Competition factor applied."}}
]
bid_template_vars = ["campaign_id", "current_roas", "trend", "competition"]
bid_template_conclusion = "Recommendation for {campaign_id}: Adjust bid by {final_adjustment:.2f}. Market Sentiment: {market_sentiment}"
r2_reasoning.register_template("bid_decision_v2", bid_template_steps, bid_template_vars, bid_template_conclusion)


print("--- MIZ 3.0 Technical Flows Layer Initialized (OKI Enhanced) ---")
if semantic_rag: print("SemanticGraphRAG: Initialized (using LLaMA 4 embeddings/generation).")
else: print("SemanticGraphRAG: Failed (check KG/FMClient/NN).")
print(f"ContextAdaptiveRL: Initialized (Base Class).")
print(f"DynamicExpertEvolution: Initialized logic.")
print(f"NeuralProcessing: Initialized (using LLaMA 4 via FM Client).")
print(f"R2Reasoning: Initialized ({len(r2_reasoning.reasoning_templates)} templates, LLaMA 4 integrated).")
print("-------------------------------------------------------------")

# Example R2 Reasoning Call
# if r2_reasoning:
#     print("\nTesting R2 Reasoning with LLaMA 4 step...")
#     r2_input = {"campaign_id": "C456", "current_roas": 5.1, "trend": "up", "competition": "high"}
#     r2_log = r2_reasoning.reason("bid_decision_v2", r2_input)
#     if r2_log:
#         print(f"R2 Reasoning Result (ID: {r2_log.get('reasoning_id')}):")
#         print(f"  Status: {r2_log.get('status')}")
#         print(f"  Conclusion: {r2_log.get('conclusion')}")
#         print("  Chain of Thought:")
#         for step in r2_log.get('chain_of_thought', []): print(f"    {step}")
#     else:
#         print("R2 Reasoning call failed.")

--- MIZ 3.0 Technical Flows Layer Initialized (OKI Enhanced) ---
SemanticGraphRAG: Failed (check KG/FMClient/NN).
ContextAdaptiveRL: Initialized (Base Class).
DynamicExpertEvolution: Initialized logic.
NeuralProcessing: Initialized (using LLaMA 4 via FM Client).
R2Reasoning: Initialized (1 templates, LLaMA 4 integrated).
-------------------------------------------------------------


In [37]:
# Cell 7: Business Applications Layer (MIZ 3.0 OKI - Reworked for MoA)
# Status: Integrates LLaMA 4 via MoE/FM Client. AWE uses MoA Comms. Privacy uses secure salt. MICA 3.0 placeholders added.

import logging
import time
import datetime
import random
import json
import uuid # Added
import asyncio # Added
from typing import Dict, Any, Optional, List, Union # Added Union
import matplotlib.pyplot as plt

# --- MoA/Orchestrator Dependency ---
# Import the new MoA system components from Cell 15
try:
    from cell15 import EnhancedBaseAgent, UnifiedCommunicationSystem, AgentMessage, MessageType, MIZ_MoA_System
except ImportError:
    logging.warning("Could not import MoA components from Cell 15. Using placeholders.")
    # Add placeholders if needed

# --- Other Dependencies ---
# Assume these are available or use placeholders
# from cell1 import EnhancedConfig, CONFIG
# from cell3 import EnhancedSelfHealingKG
# from cell4 import MixtureOfExpertsManager # Defined in Cell 4
# from cell5 import HolisticOptimizer # Defined in Cell 5
# from cell6 import NeuralProcessing, R2Reasoning, SemanticGraphRAG
# from cell11 import ExplainableAI
# from cell18 import FoundationModelClient

# --- Placeholder Dependencies ---
class PlaceholderKG: pass
class PlaceholderMoEManager:
     expert_registry = {"personalization_model_v1": {}}
     async def invoke_expert(self, *args, **kwargs): return {"recommendations": ["ITEM_A"]}
class PlaceholderXAI: pass
class PlaceholderNN:
     async def get_embedding(self, *args, **kwargs): return [0.1]*10 # Dummy embedding
class PlaceholderFMClient:
     async def generate_text(self, *args, **kwargs): return "LLaMA4 Async Content"
class PlaceholderOptimizer: pass
class PlaceholderR2: pass
class PlaceholderComms:
     async def send_message(self, *args, **kwargs): logger.debug("PlaceholderComms.send_message")
# --- End Placeholders ---

logger = logging.getLogger('MIZ-OKI.BusinessApplications')

# --- Data Pseudonymizer (Copied from Cell 3 for standalone use if needed) ---
import hashlib
class DataPseudonymizer:
    def __init__(self, salt: str):
        if not salt or salt == "default_insecure_salt_replace_me_!!": logger.critical("INSECURE SALT USED FOR PSEUDONYMIZATION!")
        self.salt = salt.encode('utf-8')
        self.sensitive_fields = {"email", "phone", "ip_address", "name", "user_id"}
    def _hash(self, value: str) -> str: return hashlib.sha256(self.salt + str(value).encode('utf-8')).hexdigest()
    def pseudonymize_value(self, key: str, value: Any) -> Any:
        if key in self.sensitive_fields and isinstance(value, str) and value: return f"pseudo_{self._hash(value)[:16]}"
        return value
    def pseudonymize_dict(self, data: Dict) -> Dict:
        if not isinstance(data, dict): return data
        return {k: self.pseudonymize_dict(v) if isinstance(v, dict) else [self.pseudonymize_dict(i) if isinstance(i, dict) else self.pseudonymize_value(k, i) for i in v] if isinstance(v, list) else self.pseudonymize_value(k, v) for k, v in data.items()}

# --- Brand Equity-Aware Bidding (Async) ---
class BrandEquityAwareBidding:
    """ Optimizes bidding asynchronously considering brand equity. """
    def __init__(self, config: Dict, kg: Any, moe_manager: Any, xai: Optional[Any] = None):
        self.config = config; self.kg = kg; self.moe_manager = moe_manager; self.xai = xai
        self.logger = logging.getLogger('MIZ-OKI.BEAB')
        self.logger.info("Brand Equity-Aware Bidding initialized (Async).")

    async def _get_brand_equity_score(self, context):
        self.logger.debug("Fetching brand equity score async (Placeholder).")
        await asyncio.sleep(0.05); return random.uniform(0.5, 0.9)

    async def _get_roas_prediction(self, context):
        expert_id = "roas_forecaster_v1"; self.logger.debug(f"Getting async ROAS prediction via MoE '{expert_id}'.")
        try:
            # Assume invoke_expert is async
            result = await self.moe_manager.invoke_expert(expert_id, context)
            return result["prediction"][0] if result and "prediction" in result and result["prediction"] else 3.0
        except Exception as e: self.logger.error(f"Async ROAS prediction failed: {e}"); return 3.0

    async def calculate_adjusted_bid(self, base_bid, context):
        """ Calculates bid asynchronously, adjusted for brand equity and predicted ROAS. """
        decision_id = f"beab_{uuid.uuid4().hex[:12]}"; start_time = time.time()
        log_entry = {"decision_id": decision_id, "type": "bid_adjustment", "timestamp": datetime.datetime.now().isoformat(), "context": "...", "base_bid": base_bid} # Avoid logging full context

        try:
            # Fetch predictions concurrently
            predicted_roas, brand_equity = await asyncio.gather(
                self._get_roas_prediction(context),
                self._get_brand_equity_score(context)
            )
            roas_target = self.config.get("roas_target", 8.0); equity_weight = self.config.get("beab_equity_weight", 0.2)
            roas_factor = 1.0 + (predicted_roas - roas_target) / roas_target
            equity_factor = 1.0 + (brand_equity - 0.7) * equity_weight
            adjusted_bid = max(self.config.get("rtb_min_bid_threshold", 0.01), base_bid * roas_factor * equity_factor)

            log_entry.update({"predicted_roas": predicted_roas, "brand_equity": brand_equity, "roas_factor": roas_factor, "equity_factor": equity_factor, "adjusted_bid": adjusted_bid, "status": "success"})
            # XAI recording remains synchronous for now, wrap if needed
            if self.xai and hasattr(self.xai, 'record_decision'): self.xai.record_decision(...) # Pass relevant args
            return adjusted_bid
        except Exception as e:
             self.logger.error(f"Failed to calculate adjusted bid async: {e}", exc_info=True)
             log_entry.update({"status": "failed", "error": str(e), "adjusted_bid": base_bid})
             if self.xai and hasattr(self.xai, 'record_decision'): self.xai.record_decision(**log_entry)
             return base_bid
        finally:
             log_entry["duration_ms"] = (time.time() - start_time) * 1000
             # MIZ 3.0 TODO: Persist log_entry async

# --- Hyperdimensional Personalization (Async) ---
class HyperdimensionalPersonalization:
    """ Generates personalized experiences asynchronously using KG embeddings. """
    def __init__(self, config: Dict, kg: Any, nn_processor: Any, moe_manager: Any):
        self.config = config; self.kg = kg; self.nn_processor = nn_processor; self.moe_manager = moe_manager
        self.logger = logging.getLogger('MIZ-OKI.HyperPersonalization')
        self.logger.info("Hyperdimensional Personalization initialized (Async).")

    async def _get_user_embedding(self, user_id):
        self.logger.debug(f"Getting embedding async for user {user_id} (Placeholder).")
        # Assume kg.get_entity and nn_processor.get_embedding are async or wrapped
        user_data = await asyncio.to_thread(self.kg.get_entity, user_id) # Wrap sync call
        if user_data:
             user_text = json.dumps(user_data, default=str)
             embedding = await self.nn_processor.get_embedding(user_text, data_type="user_profile", model_id="llama4_embedding_model") # Assume async
             return embedding
        return None

    async def get_personalized_recommendations(self, user_id, item_catalog, n=5, context=None):
        self.logger.info(f"Generating {n} recommendations async for user {user_id}.")
        user_embedding = await self._get_user_embedding(user_id)
        if user_embedding is None:
            self.logger.warning(f"Could not get embedding for {user_id}. Returning generic."); random.shuffle(item_catalog); return item_catalog[:n]

        expert_id = "personalization_model_v1"; input_data = {"user_id": user_id, "user_embedding": user_embedding.tolist(), "item_catalog": item_catalog, "num_recommendations": n, "context": context or {}}
        try:
            # Assume invoke_expert is async
            result = await self.moe_manager.invoke_expert(expert_id, input_data)
            recommendations = result.get("recommendations", []) if result else []
            self.logger.info(f"Generated {len(recommendations)} recommendations async via expert '{expert_id}'.")
            return recommendations[:n]
        except Exception as e:
            self.logger.error(f"Async recommendation expert '{expert_id}' failed: {e}"); random.shuffle(item_catalog); return item_catalog[:n]

    async def generate_personalized_content(self, user_id, content_type, base_content, context=None):
        """ Generate personalized content asynchronously using LLaMA 4. """
        if not _fm_client: return base_content # Check global client
        self.logger.info(f"Generating personalized '{content_type}' async for user {user_id}.")
        user_profile = await asyncio.to_thread(self.kg.get_entity, user_id) # Wrap sync call
        if not user_profile: return base_content

        profile_summary = json.dumps({k:v for k,v in user_profile.items() if k in ['name', 'interests']}, default=str)
        prompt = f"Personalize '{content_type}' for user: {profile_summary}. Context: {context}. Base: {base_content}. Personalized:"
        try:
            # Assume generate_text is async
            personalized_content = await _fm_client.generate_text(prompt, model_alias="llama4_scout", max_tokens=len(base_content.split())*2+50)
            return personalized_content or base_content
        except Exception as e:
            self.logger.error(f"Async LLaMA 4 content generation failed: {e}"); return base_content

# --- Adaptive Workflow Evolution (Refactored for MoA Comms) ---
class AdaptiveWorkflowEvolution:
    """ Defines, executes (via MoA Comms), and adapts business workflows asynchronously. """
    def __init__(self, config: Dict, communication_system: Optional[UnifiedCommunicationSystem], kg: Any): # Use MoA Comms
        self.config = config
        self.communication = communication_system # Store comms system
        self.kg = kg
        self.workflow_definitions = {}
        self.execution_history = deque(maxlen=1000)
        self.logger = logging.getLogger('MIZ-OKI.AdaptiveWorkflows')
        if not self.communication: self.logger.error("CommunicationSystem not provided. Cannot execute workflows.")
        self.logger.info("Adaptive Workflow Evolution initialized (Async & MoA Integrated).")

    # define_workflow remains synchronous setup

    async def execute_workflow(self, workflow_id, initial_context=None):
        """ Executes a workflow by sending initial messages via Communication System. """
        if not self.communication: return None
        if workflow_id not in self.workflow_definitions: return None
        definition = self.workflow_definitions[workflow_id]
        run_id = f"wf_run_{workflow_id}_{uuid.uuid4().hex[:8]}"
        execution_log = {"run_id": run_id, "workflow_id": workflow_id, "start_time": datetime.datetime.now().isoformat(), "status": "starting", "initial_context": initial_context, "triggered_messages": {}}
        self.execution_history.append(execution_log)
        self.logger.info(f"Starting async workflow execution run '{run_id}' for '{workflow_id}'.")

        try:
            # MIZ 3.0: Send initial message(s) to trigger the workflow, likely to BossAgent
            # For simplicity, assume first step is sent to BossAgent to plan/coordinate
            first_step_info = definition["steps"][0] if definition["steps"] else {}
            initial_task_details = {
                 "task_type": "execute_workflow", # Task for BossAgent
                 "workflow_id": workflow_id,
                 "workflow_definition": definition, # Pass definition or relevant part
                 "initial_context": initial_context or {},
                 "first_step_hint": first_step_info # Optional hint for planner
            }

            # Find BossAgent ID (assuming naming convention or registry access)
            boss_agent_id = next((aid for aid in self.communication.agent_registry_ref if "BossAgent" in aid), None)
            if not boss_agent_id: raise RuntimeError("BossAgent not found in communication system registry.")

            message = AgentMessage(
                sender=f"WorkflowEngine:{workflow_id}", receiver=boss_agent_id,
                message_type=MessageType.TASK_ASSIGNMENT, content=initial_task_details,
                trace_id=run_id # Use run_id as trace_id
            )
            await self.communication.send_message(message)
            execution_log["triggered_messages"]["initial_boss_task"] = message.id
            execution_log["status"] = "submitted_to_boss"
            self.logger.info(f"Run {run_id}: Initial task submitted to BossAgent ({boss_agent_id}), Message ID: {message.id}.")

        except Exception as e:
            self.logger.error(f"Failed to initiate workflow run '{run_id}': {e}", exc_info=True)
            execution_log["status"] = "initiation_failed"; execution_log["error"] = str(e)

        execution_log["initiation_end_time"] = datetime.datetime.now().isoformat()
        # MIZ 3.0 TODO: Persist execution log update async
        return run_id

    async def monitor_and_adapt(self, run_id):
        """ Monitor workflow execution async (Placeholder). """
        # MIZ 3.0 TODO: Implement monitoring by querying task persistence via trace_id=run_id
        self.logger.info(f"Monitoring workflow run {run_id} async (Placeholder).")
        await asyncio.sleep(0.1)

    async def evolve_workflow(self, workflow_id, context=None):
        """ Trigger workflow evolution async via message (Placeholder). """
        self.logger.info(f"Triggering async evolution for workflow '{workflow_id}' (Placeholder).")
        # MIZ 3.0 TODO: Send message to WorkflowEvolutionAgent or BossAgent
        await asyncio.sleep(0.1)

    # get_workflow_definition, get_execution_history remain synchronous

# --- Privacy Controls (Remains largely synchronous for MVP) ---
class PrivacyControls:
    """ Implements data privacy policies and techniques. """
    def __init__(self, config: Dict):
        self.config = config; self.policies = {}
        self.pseudonymizer = DataPseudonymizer(config.get("pseudonymization_salt", "default_insecure_salt_replace_me_!!"))
        self.logger = logging.getLogger('MIZ-OKI.PrivacyControls')
        self.logger.info("Privacy Controls initialized.")
        # MIZ 3.0 TODO: Load policies

    # load_policy remains synchronous setup

    def apply_policy(self, data, source_profile_id, target_profile_id):
        """ Apply privacy policy (MVP: Pseudonymization - Sync). """
        self.logger.debug(f"Applying privacy policy sync: {source_profile_id} -> {target_profile_id}")
        target_policy = self.policies.get(target_profile_id, {})
        if target_policy.get("requires_pseudonymization", True):
            if isinstance(data, list): return [self.pseudonymizer.pseudonymize_dict(item) if isinstance(item, dict) else item for item in data]
            elif isinstance(data, dict): return self.pseudonymizer.pseudonymize_dict(data)
            else: self.logger.warning(f"Cannot pseudonymize data type: {type(data)}"); return data
        return data

# --- Other Business Apps (Placeholders - Async Wrappers) ---
class RealTimeBidding:
    def __init__(self, beab_module: BrandEquityAwareBidding, optimizer: Any): self.beab = beab_module; self.optimizer = optimizer; self.logger = logging.getLogger('MIZ-OKI.RTB')
    async def process_bid_request(self, request):
        self.logger.debug("Processing bid request async (Placeholder)...")
        # Use async BEAB method
        return await self.beab.calculate_adjusted_bid(request.get('base_bid', 0.1), request.get('context', {}))

class AdOptimization:
    def __init__(self, moe_manager: Any, fm_client: Any): self.moe = moe_manager; self.fm = fm_client; self.logger = logging.getLogger('MIZ-OKI.AdOpt')
    async def optimize_creative(self, campaign_id): self.logger.debug(f"Optimizing creative async for {campaign_id}..."); await asyncio.sleep(0.1); return {"best_creative_id": "creative_xyz"}
    async def suggest_budget_shift(self, analysis_context): self.logger.debug("Suggesting budget shift async..."); await asyncio.sleep(0.1); return {"shift_to_campaign": "C789", "amount": 100.0}

class BusinessInsights:
    def __init__(self, kg: Any, r2: Any): self.kg = kg; self.r2 = r2; self.logger = logging.getLogger('MIZ-OKI.BI')
    async def generate_predictive_alert(self, alert_type): self.logger.debug(f"Generating predictive alert async: {alert_type}..."); await asyncio.sleep(0.1); return {"alert": "High churn risk", "confidence": 0.85}

class ExternalIntegration:
    def __init__(self, config: Dict): self.config = config; self.logger = logging.getLogger('MIZ-OKI.EI')
    async def push_data_to_erp(self, data): self.logger.debug("Pushing data to ERP async..."); await asyncio.sleep(0.2); return True
    async def pull_data_from_crm(self, query): self.logger.debug(f"Pulling data from CRM async: {query}..."); await asyncio.sleep(0.2); return [{"crm_id": "123"}]

class GenerativeAIApps:
     def __init__(self, fm_client: Any, hp_engine: 'HyperdimensionalPersonalization'): self.fm = fm_client; self.hp = hp_engine; self.logger = logging.getLogger('MIZ-OKI.GenAI')
     async def generate_ad_copy(self, context): self.logger.debug("Generating ad copy async via LLaMA 4..."); return await self.fm.generate_text("Generate ad copy for context: " + str(context))
     async def generate_email(self, user_id, base_template): self.logger.debug(f"Generating personalized email async for {user_id}..."); return await self.hp.generate_personalized_content(user_id, "email", base_template)

# --- Initialization ---
# Assume dependencies _config, _eshkg, _moe_manager, _xai, _nn_processor, _communication_system, _fm_client, _r2_reasoning, _holistic_optimizer are available
_config = CONFIG if 'CONFIG' in locals() else {}
_eshkg = eshkg if 'eshkg' in locals() else PlaceholderKG()
_moe_manager = moe_manager if 'moe_manager' in locals() else PlaceholderMoEManager()
_xai = xai if 'xai' in locals() else PlaceholderXAI()
_nn_processor = neural_processing if 'neural_processing' in locals() else PlaceholderNN()
_communication_system = miz_moa_system.communication_system if 'miz_moa_system' in locals() and miz_moa_system else PlaceholderComms()
_fm_client = foundation_model_client if 'foundation_model_client' in locals() else PlaceholderFMClient()
_r2_reasoning = r2_reasoning if 'r2_reasoning' in locals() else PlaceholderR2()
_holistic_optimizer = holistic_optimizer if 'holistic_optimizer' in locals() else PlaceholderOptimizer()

# Instantiate Business Application Layer Components (Async & MoA Integrated)
privacy_controls = PrivacyControls(_config)
beab = BrandEquityAwareBidding(_config, _eshkg, _moe_manager, _xai)
hyperpersonalization = HyperdimensionalPersonalization(_config, _eshkg, _nn_processor, _moe_manager)
adaptive_workflows = AdaptiveWorkflowEvolution(_config, _communication_system, _eshkg) # Pass MoA Comms
rtb = RealTimeBidding(beab, _holistic_optimizer)
ad_optimization = AdOptimization(_moe_manager, _fm_client)
business_insights = BusinessInsights(_eshkg, _r2_reasoning)
external_integration = ExternalIntegration(_config)
gen_ai_apps = GenerativeAIApps(_fm_client, hyperpersonalization)

# Example Workflow Definition (Remains Sync Setup)
if adaptive_workflows:
    customer_onboarding_steps = [
        {"id": "step1", "agent_type": "DataProcessingAgent", "task_type": "ingest_crm_data", "task_data": {"source": "signup_form"}},
        {"id": "step2", "agent_type": "PersonalizationAgent", "task_type": "generate_welcome_email", "task_data": {"template": "welcome_v1"}, "depends_on": "step1"},
        {"id": "step3", "agent_type": "CommunicationAgent", "task_type": "send_email", "task_data": {"recipient_ref": "step1.output.email", "content_ref": "step2.output.email_content"}, "depends_on": "step2"}
    ]
    adaptive_workflows.define_workflow("customer_onboarding_v1", customer_onboarding_steps, metadata={"description": "Standard onboarding."})

print("--- MIZ 3.0 Business Applications Layer Initialized (Async & MoA Integrated) ---")
# ... (Print summary as before) ...
print("--------------------------------------------------------------------")

# Example Async Workflow Execution Trigger
# async def trigger_workflow_example():
#     if adaptive_workflows:
#         print("\nTesting Async Workflow Execution via MoA Comms...")
#         run_id = await adaptive_workflows.execute_workflow("customer_onboarding_v1", initial_context={"user_id": "usr_async_456"})
#         if run_id:
#             print(f"Workflow run '{run_id}' submitted to BossAgent. Monitoring status (Placeholder)...")
#             # await asyncio.sleep(5)
#             # await adaptive_workflows.monitor_and_adapt(run_id)
#         else:
#             print("Failed to submit workflow.")
#
# # To run: asyncio.run(trigger_workflow_example())

CRITICAL:MIZ-OKI.BusinessApplications:INSECURE SALT USED FOR PSEUDONYMIZATION!


AttributeError: 'AdaptiveWorkflowEvolution' object has no attribute 'define_workflow'

In [39]:


# Cell 8: Learning Flows Implementation
# Purpose: Implements advanced learning mechanisms for continuous adaptation and improvement.
# OKI Focus: Integrates LLaMA 4, advanced KD/CV/DRL techniques, and aligns rewards with holistic objectives.

import logging
import time
import random # For placeholder drift/bias detection
# import numpy as np # Potentially for statistical tests
# import pandas as pd # Potentially for data handling
# from scipy import stats # Potentially for drift detection
# Assume external libraries for Offline RL / MARL exist if needed
# from some_offline_rl_library import OfflineRLTrainer
# from some_marl_library import MARLCoordinator

# Assuming these components exist and are importable
# from cell_1 import EnhancedConfig # Configuration
# from cell_2_1 import FoundationModelClient # LLaMA 4 Access
# from cell_5 import HolisticOptimizer # For objective alignment
# from cell_6 import ContextAdaptiveRL # Base RL Agent class
# from cell_7 import LearningIntegration # Interface to trigger learning updates

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class KnowledgeDistillation:
    """Handles distilling knowledge from a teacher model (LLaMA 4) to student models."""

    def __init__(self, fm_client: 'FoundationModelClient', config: 'EnhancedConfig'):
        self.fm_client = fm_client
        self.config = config
        # Priority: Medium - Use LLaMA 4 (Maverick) as the teacher
        self.teacher_model_id = config.get('learning_flows.kd.teacher_model_id', 'llama4-maverick') # Example ID
        logging.info(f"KnowledgeDistillation initialized with teacher model: {self.teacher_model_id}")

    def distill_knowledge(self, student_model, dataset, distillation_params):
        """
        Performs knowledge distillation from the teacher (LLaMA 4) to the student model.
        """
        logging.info(f"Starting knowledge distillation for student: {type(student_model).__name__}")

        # 1. Generate teacher predictions/logits/embeddings using LLaMA 4
        teacher_outputs = []
        try:
            # Example: Get embeddings or generate soft labels depending on the task
            # This would involve batching data and calling self.fm_client
            # For simplicity, we'll simulate this step.
            logging.info(f"Querying teacher model ({self.teacher_model_id}) via FM Client...")
            # teacher_outputs = self.fm_client.generate_batch(self.teacher_model_id, dataset['inputs'], task_type='distillation')
            # Placeholder:
            teacher_outputs = [random.random() for _ in dataset['inputs']] # Simulate some output
            logging.info("Received outputs from teacher model.")

        except Exception as e:
            logging.error(f"Failed to get outputs from teacher model {self.teacher_model_id}: {e}")
            return False # Indicate failure

        # 2. Train student model using teacher outputs
        logging.info("Training student model using teacher outputs...")
        # student_model.train(dataset['inputs'], teacher_outputs, distillation_params) # Actual training logic here
        time.sleep(1) # Simulate training time
        logging.info("Student model training complete.")

        # 3. Placeholder/Design for Cross-Modal KD (Priority: Low)
        if distillation_params.get('cross_modal', False):
            logging.warning("Cross-Modal KD requested but not fully implemented. Placeholder logic.")
            # Example: LLaMA 4 analyzes text feedback (unstructured) related to structured data predictions.
            # Insights from text analysis could adjust loss function or features for the structured data model.
            # text_insights = self.fm_client.analyze(self.teacher_model_id, dataset['related_text_feedback'])
            # student_model.apply_cross_modal_insights(text_insights)

        return True # Indicate success

class ContinuousValidation:
    """Monitors model performance, detects drift/bias, and processes feedback."""

    def __init__(self, fm_client: 'FoundationModelClient', learning_integration: 'LearningIntegration', config: 'EnhancedConfig'):
        self.fm_client = fm_client
        self.learning_integration = learning_integration
        self.config = config
        self.feedback_queue = [] # Simple in-memory queue for demonstration
        self.monitoring_interval = config.get('learning_flows.cv.monitoring_interval_seconds', 300)
        self.drift_threshold = config.get('learning_flows.cv.drift_threshold', 0.05)
        self.bias_threshold = config.get('learning_flows.cv.bias_threshold', 0.1)
        logging.info("ContinuousValidation initialized.")

    def add_feedback(self, feedback_data):
        """Adds feedback (structured or unstructured) to the queue."""
        self.feedback_queue.append(feedback_data)
        logging.debug(f"Added feedback to queue. Queue size: {len(self.feedback_queue)}")

    def _detect_drift(self, historical_data, current_data) -> bool:
        """
        Placeholder for concept/data drift detection.
        Priority: Medium - Implement using statistical tests or dedicated libraries.
        """
        # Example using a simple random check - REPLACE WITH ACTUAL TESTS (e.g., KS, Chi-squared)
        logging.info("Checking for data/concept drift...")
        # Compare distributions, prediction accuracy, etc.
        # drift_detected = stats.ks_2samp(historical_data['predictions'], current_data['predictions']).pvalue < self.drift_threshold
        drift_detected = random.random() < 0.1 # Simulate 10% chance of drift detection
        if drift_detected:
            logging.warning(f"Potential data/concept drift detected!")
        else:
            logging.info("No significant drift detected.")
        return drift_detected

    def _detect_bias(self, data, sensitive_attributes) -> bool:
        """
        Placeholder for bias detection (e.g., demographic parity, equalized odds).
        Priority: Medium - Implement using fairness metrics libraries.
        """
        logging.info(f"Checking for bias across attributes: {sensitive_attributes}...")
        # Compare performance metrics across different groups defined by sensitive_attributes
        # bias_detected = calculate_fairness_metric(data) > self.bias_threshold
        bias_detected = random.random() < 0.05 # Simulate 5% chance of bias detection
        if bias_detected:
            logging.warning(f"Potential bias detected!")
        else:
            logging.info("No significant bias detected.")
        return bias_detected

    def process_feedback_queue(self):
        """Processes feedback, potentially using LLaMA 4 for unstructured data."""
        logging.info(f"Processing feedback queue ({len(self.feedback_queue)} items)...")
        processed_feedback = []
        while self.feedback_queue:
            item = self.feedback_queue.pop(0)
            if isinstance(item.get('feedback'), str) and item.get('type') == 'unstructured':
                # Priority: Medium - Integrate LLaMA 4 for unstructured feedback analysis
                logging.info("Analyzing unstructured feedback using LLaMA 4...")
                try:
                    # analysis_result = self.fm_client.analyze(
                    #     model_id=self.config.get('learning_flows.cv.feedback_analyzer_model_id', 'llama4-maverick'),
                    #     text=item['feedback'],
                    #     task_type='sentiment_topic_extraction'
                    # )
                    # Placeholder:
                    analysis_result = {'sentiment': random.choice(['positive', 'negative']), 'topic': 'general'}
                    item['analysis'] = analysis_result
                    logging.info(f"LLaMA 4 analysis result: {analysis_result}")
                except Exception as e:
                    logging.error(f"Failed to analyze unstructured feedback with LLaMA 4: {e}")
                    item['analysis'] = {'error': str(e)}
            processed_feedback.append(item)

        if processed_feedback:
            logging.info("Feedback processing complete.")
            # Priority: High - Ensure feedback triggers LearningIntegration
            self.learning_integration.trigger_update(source='continuous_validation', data=processed_feedback)
        else:
            logging.info("Feedback queue is empty.")

    def run_validation_cycle(self, historical_data, current_data, sensitive_attributes=None):
        """Runs a full validation cycle including drift, bias, and feedback processing."""
        logging.info("Starting continuous validation cycle.")
        drift_detected = self._detect_drift(historical_data, current_data)
        bias_detected = False
        if sensitive_attributes:
            bias_detected = self._detect_bias(current_data, sensitive_attributes)

        self.process_feedback_queue()

        if drift_detected or bias_detected:
            logging.warning("Validation cycle detected issues requiring attention (drift or bias).")
            # Trigger retraining or specific interventions via LearningIntegration
            self.learning_integration.trigger_update(
                source='continuous_validation_alert',
                data={'drift_detected': drift_detected, 'bias_detected': bias_detected}
            )
        logging.info("Continuous validation cycle finished.")


class DynamicRewardSystem:
    """Calculates rewards for RL agents, potentially adjusting based on holistic objectives."""

    def __init__(self, holistic_optimizer: 'HolisticOptimizer', config: 'EnhancedConfig'):
        self.holistic_optimizer = holistic_optimizer
        self.config = config
        self.base_reward_weights = config.get('learning_flows.drs.base_weights', {'task_completion': 1.0, 'efficiency': 0.5})
        self.current_reward_weights = self.base_reward_weights.copy()
        logging.info(f"DynamicRewardSystem initialized with base weights: {self.base_reward_weights}")

    def _adjust_weights_for_objectives(self):
        """
        Priority: Medium - Adjust reward weights based on current holistic objectives.
        Fetches priorities from HolisticOptimizer and modulates weights.
        """
        try:
            current_objectives = self.holistic_optimizer.get_current_objective_priorities()
            logging.info(f"Fetched current objective priorities: {current_objectives}")

            # Example modulation logic: Increase weight for rewards aligned with high-priority objectives
            adjusted_weights = self.base_reward_weights.copy()
            for objective, priority in current_objectives.items():
                # This mapping needs to be defined based on how rewards relate to objectives
                if objective == 'maximize_ROAS' and 'efficiency' in adjusted_weights:
                    adjusted_weights['efficiency'] *= (1 + priority * 0.5) # Boost efficiency weight based on ROAS priority
                elif objective == 'minimize_CAC' and 'task_completion' in adjusted_weights:
                     # Example: Maybe completing tasks faster reduces interaction cost?
                    adjusted_weights['task_completion'] *= (1 + priority * 0.2)

            self.current_reward_weights = adjusted_weights
            logging.info(f"Adjusted reward weights based on objectives: {self.current_reward_weights}")

        except Exception as e:
            logging.error(f"Failed to adjust reward weights based on objectives: {e}")
            # Fallback to last known weights or base weights
            self.current_reward_weights = self.base_reward_weights.copy()


    def calculate_reward(self, state, action, next_state, outcome_metrics):
        """Calculates the reward for a given transition based on current weights."""
        self._adjust_weights_for_objectives() # Adjust weights before calculating reward

        total_reward = 0
        for metric, value in outcome_metrics.items():
            if metric in self.current_reward_weights:
                total_reward += value * self.current_reward_weights[metric]

        logging.debug(f"Calculated reward: {total_reward} based on metrics: {outcome_metrics} and weights: {self.current_reward_weights}")
        return total_reward

class DistributedReinforcementLearning:
    """Manages distributed RL agents, incorporating Offline RL and MARL concepts."""

    def __init__(self, reward_system: DynamicRewardSystem, learning_integration: 'LearningIntegration', config: 'EnhancedConfig'):
        self.reward_system = reward_system
        self.learning_integration = learning_integration
        self.config = config
        self.agents = {} # Dictionary to store registered RL agents
        self.experience_buffer = [] # Shared or distributed buffer
        self.offline_rl_enabled = config.get('learning_flows.drl.enable_offline_rl', False)
        self.marl_enabled = config.get('learning_flows.drl.enable_marl', False)
        # self.offline_trainer = OfflineRLTrainer() if self.offline_rl_enabled else None # Priority: Medium
        # self.marl_coordinator = MARLCoordinator() if self.marl_enabled else None # Priority: Low

        logging.info(f"DistributedReinforcementLearning initialized. Offline RL: {self.offline_rl_enabled}, MARL: {self.marl_enabled}")

    def register_agent(self, agent_id: str, agent_instance: 'ContextAdaptiveRL'):
        """Registers an RL agent."""
        # Priority: Medium - Ensure agents use ContextAdaptiveRL or enhanced version
        if not isinstance(agent_instance, ContextAdaptiveRL):
             logging.warning(f"Agent {agent_id} is not an instance of ContextAdaptiveRL. Ensure compatibility.")
        self.agents[agent_id] = agent_instance
        logging.info(f"Registered RL agent: {agent_id}")

    def collect_experience(self, agent_id, state, action, reward, next_state, done, info):
        """Collects experience from agents and stores it."""
        experience = (state, action, reward, next_state, done, info)
        self.experience_buffer.append(experience)
        # Potentially push to a distributed buffer service here

    def train_agents(self):
        """Triggers training for registered agents (online, offline, MARL)."""
        logging.info("Starting training cycle for RL agents...")

        # Online Training (example for each agent)
        for agent_id, agent in self.agents.items():
             if hasattr(agent, 'learn_online') and agent.is_ready_to_learn():
                 logging.info(f"Triggering online learning for agent {agent_id}")
                 # agent.learn_online(self.experience_buffer) # Agent pulls relevant data

        # Offline RL Training (Priority: Medium)
        if self.offline_rl_enabled: # and self.offline_trainer:
            logging.info("Performing Offline RL training step...")
            # self.offline_trainer.train(self.experience_buffer)
            # updated_policies = self.offline_trainer.get_updated_policies()
            # Distribute updated policies to relevant agents or trigger LearningIntegration
            # self.learning_integration.trigger_update(source='offline_rl', data=updated_policies)
            logging.warning("Offline RL training logic placeholder.") # Placeholder message

        # MARL Coordination (Priority: Low)
        if self.marl_enabled: # and self.marl_coordinator:
            logging.info("Performing MARL coordination step...")
            # self.marl_coordinator.coordinate(self.agents, self.experience_buffer)
            # Coordination might involve sharing state, negotiating joint actions, or centralized training
            logging.warning("MARL coordination logic placeholder.") # Placeholder message

        logging.info("RL training cycle finished.")

# Example Usage (Conceptual)
# config = EnhancedConfig()
# fm_client = FoundationModelClient(config)
# holistic_optimizer = HolisticOptimizer(config) # Assume initialized
# learning_integration = LearningIntegration(config) # Assume initialized

# kd = KnowledgeDistillation(fm_client, config)
# cv = ContinuousValidation(fm_client, learning_integration, config)
# drs = DynamicRewardSystem(holistic_optimizer, config)
# drl = DistributedReinforcementLearning(drs, learning_integration, config)

# # Example RL agent (needs definition based on ContextAdaptiveRL)
# # rl_agent_1 = ContextAdaptiveRL(...)
# # drl.register_agent("marketing_bidder", rl_agent_1)

# # --- Simulation ---
# # cv.add_feedback({'type': 'unstructured', 'feedback': 'The campaign visuals are confusing.'})
# # cv.run_validation_cycle(historical_data={}, current_data={}, sensitive_attributes=['geo'])
# # reward = drs.calculate_reward(state={}, action={}, next_state={}, outcome_metrics={'task_completion': 1, 'efficiency': 0.8})
# # drl.collect_experience("marketing_bidder", {}, {}, reward, {}, False, {})
# # drl.train_agents()
# # kd.distill_knowledge(student_model=None, dataset={'inputs': [...]}, distillation_params={})




In [40]:

# Cell 9: System Integration and Testing
# Purpose: Provides comprehensive tests to ensure all OKI components integrate and function correctly.
# OKI Focus: Covers LLaMA 4 integration, MoA interactions, B.O.S.S. loop, and infrastructure reliability.

import unittest
import logging
# from unittest.mock import patch, MagicMock # Useful for mocking dependencies

# Assume necessary components are importable
# from cell_1 import EnhancedConfig
# from cell_2_1 import FoundationModelClient
# from cell_3 import KnowledgeGraphAdapter
# from cell_4 import OrchestratorClient, Task # Assuming Task object definition
# from cell_5 import HolisticOptimizer, HybridDecisionEngine
# from cell_6 import MoEManager, R2Reasoning, ContextAdaptiveRL, BossAgent # Assuming Agent structure
# from cell_7 import LearningIntegration, DataFlowManager
# from cell_8 import KnowledgeDistillation, ContinuousValidation, DynamicRewardSystem, DistributedReinforcementLearning
# from cell_10 import BusinessImpactDashboard
# from cell_11 import ExplainableAI

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Mock objects for dependencies (replace with actual imports or more sophisticated mocks)
class MockEnhancedConfig:
    def get(self, key, default=None):
        # Simple mock config
        if key == 'knowledge_graph.uri': return "mock_uri"
        if key == 'orchestrator.queue_name': return "mock_queue"
        if key == 'foundation_model.default_id': return "mock_llama4"
        return default

class MockKGAdapter:
    def connect(self): return True
    def bulk_load_data(self, data): logging.info(f"Mock KG: Bulk loading {len(data)} items."); return True
    def query(self, query): logging.info(f"Mock KG: Running query: {query}"); return [{'mock_result': 1}]

class MockOrchestratorClient:
    def enqueue_task(self, task): logging.info(f"Mock Orchestrator: Enqueuing task: {task.task_type}"); return True
    def dequeue_task(self): logging.info("Mock Orchestrator: Dequeuing task."); return None # Simulate empty queue initially

class MockFoundationModelClient:
    def generate(self, model_id, prompt, **kwargs): logging.info(f"Mock FMClient: Generating for {model_id}"); return "Mock generation"
    def embed(self, model_id, text, **kwargs): logging.info(f"Mock FMClient: Embedding for {model_id}"); return [0.1, 0.2, 0.3]
    def analyze(self, model_id, text, **kwargs): logging.info(f"Mock FMClient: Analyzing for {model_id}"); return {'sentiment': 'neutral'}

class MockBossAgent:
     def __init__(self, orchestrator_client, fm_client, config): pass
     def process_task(self, task): logging.info(f"Mock BossAgent: Processing task {task.task_id}"); return "Processed"

class MockTask:
    def __init__(self, task_id, task_type, data):
        self.task_id = task_id
        self.task_type = task_type
        self.data = data

# --- Test Suite ---

class TestMIZOKIIntegration(unittest.TestCase):
    """Comprehensive integration tests for the MIZ 3.0 OKI Platform."""

    @classmethod
    def setUpClass(cls):
        """Set up shared resources for all tests."""
        logging.info("Setting up MIZ OKI Integration Test Suite...")
        cls.config = MockEnhancedConfig()
        cls.kg_adapter = MockKGAdapter()
        cls.orchestrator_client = MockOrchestratorClient()
        cls.fm_client = MockFoundationModelClient()
        # Instantiate other components if needed for deeper tests, potentially mocking their dependencies

    def test_01_config_loading(self):
        """Test if configuration seems accessible."""
        logging.info("Running test: test_01_config_loading")
        self.assertIsNotNone(self.config.get('knowledge_graph.uri'))
        self.assertEqual(self.config.get('orchestrator.queue_name'), "mock_queue")
        logging.info("Config loading test passed.")

    def test_02_infrastructure_kg_connectivity(self):
        """Test basic Knowledge Graph connectivity and operations."""
        logging.info("Running test: test_02_infrastructure_kg_connectivity")
        self.assertTrue(self.kg_adapter.connect())
        # Test bulk loading (basic check)
        self.assertTrue(self.kg_adapter.bulk_load_data([{'id': 1, 'prop': 'a'}, {'id': 2, 'prop': 'b'}]))
        # Test querying (basic check)
        result = self.kg_adapter.query("MATCH (n) RETURN count(n)")
        self.assertIsInstance(result, list)
        self.assertGreaterEqual(len(result), 0) # Allow empty list or results
        logging.info("KG connectivity test passed.")

    def test_03_infrastructure_orchestrator_queue(self):
        """Test basic Orchestrator queue operations."""
        logging.info("Running test: test_03_infrastructure_orchestrator_queue")
        task = MockTask(task_id="t123", task_type="data_ingestion", data={'source': 'crm'})
        self.assertTrue(self.orchestrator_client.enqueue_task(task))
        # In a real scenario, dequeue might need a wait or a setup where a task exists
        # For this mock, dequeue returns None, which is a valid state (empty queue)
        dequeued_task = self.orchestrator_client.dequeue_task()
        self.assertIsNone(dequeued_task) # Or assert based on mock setup
        logging.info("Orchestrator queue test passed.")

    def test_04_llama4_invocation(self):
        """Test invoking LLaMA 4 via FoundationModelClient for different tasks."""
        logging.info("Running test: test_04_llama4_invocation")
        model_id = self.config.get('foundation_model.default_id')
        # Test generation
        generation_result = self.fm_client.generate(model_id, "Write a headline")
        self.assertIsInstance(generation_result, str)
        self.assertTrue(len(generation_result) > 0)
        # Test embedding
        embedding_result = self.fm_client.embed(model_id, "Some text to embed")
        self.assertIsInstance(embedding_result, list)
        self.assertTrue(all(isinstance(x, float) for x in embedding_result))
        # Test analysis (example)
        analysis_result = self.fm_client.analyze(model_id, "This is feedback text.")
        self.assertIsInstance(analysis_result, dict)
        self.assertIn('sentiment', analysis_result)
        logging.info("LLaMA 4 invocation test passed.")

    # @patch('cell_6.SomeSpecificAgent') # Example if mocking a specific agent needed
    # @patch('cell_4.OrchestratorClient') # Example if mocking orchestrator for this test
    def test_05_moa_interaction_simulation(self):
        """Simulate a multi-step task involving BossAgent, agents, and orchestrator."""
        logging.info("Running test: test_05_moa_interaction_simulation")
        # This test is highly conceptual without the full agent implementations
        # 1. Enqueue an initial task
        initial_task = MockTask(task_id="maintask_001", task_type="marketing_campaign_brief", data={'goal': 'Increase signups'})
        self.orchestrator_client.enqueue_task(initial_task)

        # 2. Simulate BossAgent picking up and decomposing (conceptual)
        # Assume BossAgent dequeues, decides subtasks, enqueues them
        # For this test, we'll manually create subtasks BossAgent might create
        subtask1 = MockTask(task_id="subtask_001a", task_type="generate_ad_copy", data={'brief_id': 'maintask_001'})
        subtask2 = MockTask(task_id="subtask_001b", task_type="target_audience_analysis", data={'brief_id': 'maintask_001'})
        self.orchestrator_client.enqueue_task(subtask1)
        self.orchestrator_client.enqueue_task(subtask2)

        # 3. Simulate specific agents processing subtasks (conceptual)
        # An agent would dequeue subtask1, use FMClient (LLaMA 4) to generate copy, store result (e.g., in KG), mark task done.
        # Another agent would dequeue subtask2, query data sources (maybe via DataFlowManager), store analysis.

        # 4. Assertions (conceptual) - check if expected artifacts were created (e.g., in mock KG) or tasks completed
        # self.kg_adapter.query("MATCH (a:AdCopy {brief_id: 'maintask_001'}) RETURN a") -> check result
        # self.kg_adapter.query("MATCH (a:AudienceAnalysis {brief_id: 'maintask_001'}) RETURN a") -> check result
        logging.warning("MoA interaction test is conceptual and requires full agent implementations for validation.")
        self.assertTrue(True) # Placeholder assertion
        logging.info("MoA interaction simulation test passed (conceptually).")

    def test_06_boss_loop_component_check(self):
        """Test a specific component within the B.O.S.S. loop, e.g., HybridDecisionEngine."""
        logging.info("Running test: test_06_boss_loop_component_check")
        # Requires instantiation of HDE, potentially with mocks for its dependencies (MoEManager, R2Reasoning, etc.)
        # hde = HybridDecisionEngine(moe_manager=MockMoEManager(), r2_reasoning=MockR2Reasoning(), ...)
        # context = {'data': ..., 'objective': ...}
        # decision = hde.make_decision(context)
        # self.assertIsNotNone(decision)
        # self.assertIn('action', decision)
        logging.warning("B.O.S.S. loop component test requires component implementation.")
        self.assertTrue(True) # Placeholder assertion
        logging.info("B.O.S.S. loop component test passed (conceptually).")

    @classmethod
    def tearDownClass(cls):
        """Clean up resources after all tests."""
        logging.info("Tearing down MIZ OKI Integration Test Suite.")
        # Close connections, delete temporary files, etc.

# To run the tests (if saved as a file, e.g., test_integration.py):
# python -m unittest test_integration.py



In [41]:


# Cell 10: Business Impact Monitoring
# Purpose: Tracks key business KPIs to measure the real-world impact and value generated by the OKI platform.
# OKI Focus: Monitors specified OKI KPIs (CAC, ROAS, CLV, etc.) and integrates with real data sources (KG, Data Flows).

import logging
import pandas as pd # Using pandas for potential data manipulation and structuring
# import matplotlib.pyplot as plt # Optional: for plotting directly
# import plotly.express as px # Optional: for interactive plots

# Assume necessary components are importable
# from cell_1 import EnhancedConfig
# from cell_3 import KnowledgeGraphAdapter
# from cell_7 import DataFlowManager

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class BusinessImpactDashboard:
    """Monitors and visualizes key business KPIs impacted by the OKI platform."""

    def __init__(self, config: 'MockEnhancedConfig', kg_adapter: 'MockKGAdapter', data_flow_manager: 'DataFlowManager' = None):
        self.config = config
        self.kg_adapter = kg_adapter
        self.data_flow_manager = data_flow_manager # Optional, depending on data source
        self.target_metrics = {}
        self.actual_metrics_history = pd.DataFrame() # Store historical data
        self._load_config_metrics()
        logging.info(f"BusinessImpactDashboard initialized. Tracking metrics: {list(self.target_metrics.keys())}")

    def _load_config_metrics(self):
        """Loads the target KPIs and their configurations from EnhancedConfig."""
        # Priority: Configure OKI Metrics
        self.target_metrics = self.config.get('business_impact.kpis', {
            # Example KPIs from OKI spec - define structure in config
            'customer_acquisition_cost': {'target': 50, 'unit': 'USD', 'lower_is_better': True, 'data_source': 'kg', 'query': 'FETCH_CAC_DATA'},
            'return_on_ad_spend': {'target': 4.0, 'unit': 'ratio', 'lower_is_better': False, 'data_source': 'kg', 'query': 'FETCH_ROAS_DATA'},
            'customer_lifetime_value': {'target': 500, 'unit': 'USD', 'lower_is_better': False, 'data_source': 'kg', 'query': 'FETCH_CLV_DATA'},
            'campaign_conversion_rate': {'target': 0.05, 'unit': 'percentage', 'lower_is_better': False, 'data_source': 'data_flow', 'flow_name': 'CalculateConversionRateFlow'},
            # Add other relevant KPIs
        })
        logging.info("Loaded target KPI configurations.")

    def import_actual_metrics(self, time_period):
        """
        Imports actual metric values for a given time period from configured sources.
        Priority: Medium - Implement real data import logic.
        """
        logging.info(f"Importing actual metrics for time period: {time_period}")
        new_metrics = {'time_period': time_period}
        import_successful = True

        for metric_name, metric_config in self.target_metrics.items():
            data_source = metric_config.get('data_source')
            metric_value = None
            try:
                if data_source == 'kg':
                    query_name = metric_config.get('query')
                    if query_name and self.kg_adapter:
                        # Example: Query needs to be defined to aggregate data in KG
                        # query = f"MATCH (m:Metric {{name: '{metric_name}', period: '{time_period}'}}) RETURN m.value"
                        # result = self.kg_adapter.query(query) # This needs refinement based on KG schema
                        # Placeholder query result simulation
                        logging.info(f"Querying KG for metric: {metric_name} (Query: {query_name})")
                        # Simulate KG returning a value based on metric type
                        if metric_name == 'customer_acquisition_cost': metric_value = random.uniform(45, 60)
                        elif metric_name == 'return_on_ad_spend': metric_value = random.uniform(3.5, 4.5)
                        elif metric_name == 'customer_lifetime_value': metric_value = random.uniform(480, 550)
                        else: metric_value = random.random() # Default random value
                        logging.info(f"Retrieved value from KG: {metric_value}")
                    else:
                        logging.warning(f"KG Adapter not available or query not specified for metric: {metric_name}")

                elif data_source == 'data_flow':
                    flow_name = metric_config.get('flow_name')
                    if flow_name and self.data_flow_manager:
                        logging.info(f"Triggering data flow for metric: {metric_name} (Flow: {flow_name})")
                        # result = self.data_flow_manager.run_flow_sync(flow_name, params={'time_period': time_period})
                        # metric_value = result.get('metric_value') if result else None
                        # Placeholder simulation
                        metric_value = random.uniform(0.04, 0.06) if metric_name == 'campaign_conversion_rate' else random.random()
                        logging.info(f"Retrieved value from Data Flow: {metric_value}")
                    else:
                        logging.warning(f"Data Flow Manager not available or flow name not specified for metric: {metric_name}")

                elif data_source == 'manual' or data_source is None:
                     logging.warning(f"Metric '{metric_name}' requires manual input or has no data source configured.")
                     # Allow manual input or skip
                     metric_value = None # Or fetch from a manual input source if implemented

                else:
                    logging.error(f"Unsupported data source '{data_source}' for metric: {metric_name}")

                if metric_value is not None:
                    new_metrics[metric_name] = metric_value
                else:
                    logging.warning(f"Could not retrieve value for metric: {metric_name}")
                    # Decide how to handle missing data (e.g., skip, use NaN, previous value)
                    new_metrics[metric_name] = pd.NA # Use pandas NA for missing data

            except Exception as e:
                logging.error(f"Failed to import metric '{metric_name}' from {data_source}: {e}")
                new_metrics[metric_name] = pd.NA
                import_successful = False

        # Append new data to history
        new_row = pd.DataFrame([new_metrics])
        self.actual_metrics_history = pd.concat([self.actual_metrics_history, new_row], ignore_index=True)
        logging.info(f"Finished importing metrics. Current history size: {len(self.actual_metrics_history)} rows.")
        return import_successful

    def calculate_kpi_trends(self, window=5):
        """Calculates trends or rolling averages for KPIs."""
        trends = {}
        if len(self.actual_metrics_history) >= window:
            for metric_name in self.target_metrics.keys():
                if metric_name in self.actual_metrics_history.columns:
                    rolling_avg = self.actual_metrics_history[metric_name].rolling(window=window).mean().iloc[-1]
                    trends[f"{metric_name}_rolling_avg_{window}"] = rolling_avg
        return trends

    def generate_dashboard_data(self, time_period=None):
        """Generates data structure for dashboard display (can be used by a UI framework)."""
        if time_period:
            latest_data = self.actual_metrics_history[self.actual_metrics_history['time_period'] == time_period]
        else:
            latest_data = self.actual_metrics_history.tail(1)

        if latest_data.empty:
            logging.warning("No data available to generate dashboard.")
            return {}

        dashboard_data = {"summary": {}, "history": self.actual_metrics_history.to_dict('records')}
        latest_row = latest_data.iloc[0]

        for metric_name, config in self.target_metrics.items():
            if metric_name in latest_row and pd.notna(latest_row[metric_name]):
                actual = latest_row[metric_name]
                target = config.get('target')
                status = "N/A"
                if target is not None:
                    is_better = (actual <= target) if config.get('lower_is_better', False) else (actual >= target)
                    status = "Meeting Target" if is_better else "Below Target"
                dashboard_data["summary"][metric_name] = {
                    "actual": actual,
                    "target": target,
                    "unit": config.get('unit', ''),
                    "status": status
                }
            else:
                 dashboard_data["summary"][metric_name] = {
                    "actual": "N/A",
                    "target": config.get('target'),
                    "unit": config.get('unit', ''),
                    "status": "Data Missing"
                }

        # Add trends
        dashboard_data["trends"] = self.calculate_kpi_trends()

        logging.info("Generated dashboard data structure.")
        # In a real application, this data would feed a visualization library or API endpoint
        # self.plot_kpis(dashboard_data) # Optional: call plotting function
        return dashboard_data

    # Optional: Add plotting functions using matplotlib or plotly if needed
    # def plot_kpis(self, dashboard_data): ...

# Example Usage (Conceptual)
# config = MockEnhancedConfig() # Use mock or real EnhancedConfig
# kg_adapter = MockKGAdapter() # Use mock or real KGAdapter
# data_flow_manager = None # Mock or real DataFlowManager if needed

# dashboard = BusinessImpactDashboard(config, kg_adapter, data_flow_manager)
# dashboard.import_actual_metrics(time_period="2025-Q1")
# dashboard.import_actual_metrics(time_period="2025-Q2")
# dashboard_data = dashboard.generate_dashboard_data()
# print(dashboard_data) # Print the generated data structure





In [42]:


# Cell 11: Explainable AI (XAI)
# Purpose: Provides mechanisms to understand and interpret the decisions and predictions made by the OKI system.
# OKI Focus: Implements CoT logging, SHAP/LIME integration, Counterfactuals, MoE analysis (placeholder), and decision recording.

import logging
import json
import datetime
# import shap # Placeholder: Requires shap library installation
# import lime # Placeholder: Requires lime library installation
# import lime.lime_tabular # Placeholder

# Assume necessary components are importable
# from cell_1 import EnhancedConfig
# from cell_2_1 import FoundationModelClient
# from cell_3 import KnowledgeGraphAdapter
# from cell_6 import MoEManager # If needed for model access or MoE analysis

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class ExplainableAI:
    """Provides XAI capabilities for the OKI platform."""

    def __init__(self, config: 'MockEnhancedConfig', fm_client: 'MockFoundationModelClient', kg_adapter: 'MockKGAdapter', moe_manager: 'MoEManager' = None):
        self.config = config
        self.fm_client = fm_client # Needed for counterfactuals
        self.kg_adapter = kg_adapter # Needed for storing/retrieving decision logs/CoT
        self.moe_manager = moe_manager # Potentially needed for SHAP/LIME model access or MoE analysis
        self.xai_storage_type = config.get('xai.storage_type', 'kg') # 'kg', 'log_file', 'database'
        self.log_file_path = config.get('xai.log_file_path', 'xai_decisions.log')
        logging.info(f"ExplainableAI initialized. Storage type: {self.xai_storage_type}")

    def record_decision(self, decision_id: str, component: str, context: dict, decision: dict, inputs: dict, outputs: dict, chain_of_thought: list = None, model_used: str = None, timestamp=None):
        """
        Records the details of a decision for later explanation.
        Priority: High - Ensure key components call this method.
        """
        if timestamp is None:
            timestamp = datetime.datetime.utcnow().isoformat()

        record = {
            'decision_id': decision_id,
            'timestamp': timestamp,
            'component': component, # e.g., 'HybridDecisionEngine', 'BEAB', 'RTB'
            'model_used': model_used, # e.g., 'llama4-maverick', 'specific_agent_model'
            'context': context, # Input context/state
            'inputs': inputs, # Specific inputs to the model/logic
            'decision': decision, # The decision made (e.g., action, prediction)
            'outputs': outputs, # Raw outputs if different from decision
            'chain_of_thought': chain_of_thought # List of reasoning steps (CoT)
        }

        logging.info(f"Recording decision: {decision_id} from component: {component}")
        try:
            if self.xai_storage_type == 'kg':
                # Requires a specific KG schema for XAI records
                # Example: Create a node :DecisionLog and connect it to context, component etc.
                # self.kg_adapter.save_decision_record(record) # Assumes method exists in adapter
                logging.info(f"Storing decision {decision_id} in Knowledge Graph (simulated).")
                # Simulate storing by logging the record content
                # print(json.dumps(record, indent=2, default=str)) # Print for debug/simulation
                pass # Placeholder for actual KG write operation

            elif self.xai_storage_type == 'log_file':
                with open(self.log_file_path, 'a') as f:
                    f.write(json.dumps(record, default=str) + '\n')
            else:
                 logging.warning(f"Unsupported XAI storage type: {self.xai_storage_type}")

        except Exception as e:
            logging.error(f"Failed to record decision {decision_id}: {e}")

    def _retrieve_decision_log(self, decision_id: str) -> dict | None:
        """Helper to retrieve a specific decision record."""
        logging.info(f"Retrieving decision log for ID: {decision_id}")
        if self.xai_storage_type == 'kg':
            # query = f"MATCH (d:DecisionLog {{decision_id: '{decision_id}'}}) RETURN d"
            # result = self.kg_adapter.query(query) # Assumes method exists
            # return result[0]['d'] if result else None
            logging.warning("KG retrieval for decision log not implemented (simulation).")
            # Simulate finding a record (replace with actual retrieval)
            if decision_id == "example_decision_id_cot":
                 return {
                    'decision_id': 'example_decision_id_cot', 'timestamp': datetime.datetime.utcnow().isoformat(),
                    'component': 'R2Reasoning', 'model_used': 'llama4-maverick',
                    'context': {'user_query': 'Why was my ad rejected?'}, 'inputs': {'policy_docs': '[docs]', 'ad_content': '[content]'},
                    'decision': {'status': 'rejected', 'reason_code': 'POLICY_VIOLATION'}, 'outputs': {'raw_llm_output': '...'},
                    'chain_of_thought': [
                        "Step 1: Identify relevant policy sections based on ad content.",
                        "Step 2: Compare ad content against policy criteria.",
                        "Step 3: Found violation regarding 'prohibited claims'.",
                        "Step 4: Formulate rejection reason based on violation."
                    ]}
            return None # Simulate not found
        elif self.xai_storage_type == 'log_file':
            try:
                with open(self.log_file_path, 'r') as f:
                    for line in f:
                        record = json.loads(line)
                        if record.get('decision_id') == decision_id:
                            return record
            except FileNotFoundError:
                logging.error(f"XAI log file not found: {self.log_file_path}")
            except Exception as e:
                logging.error(f"Error reading XAI log file: {e}")
            return None
        else:
            logging.error(f"Cannot retrieve decision log: Unsupported storage type {self.xai_storage_type}")
            return None


    def explain_decision(self, decision_id: str, method: str = 'chain_of_thought', **kwargs):
        """
        Generates an explanation for a recorded decision using the specified method.
        """
        logging.info(f"Generating explanation for decision {decision_id} using method: {method}")
        decision_log = self._retrieve_decision_log(decision_id)

        if not decision_log:
            return f"Explanation failed: Decision log not found for ID: {decision_id}"

        try:
            if method == 'chain_of_thought':
                # Priority: High - Retrieve and format CoT logs
                cot = decision_log.get('chain_of_thought')
                if cot:
                    explanation = f"Decision Explanation (Chain of Thought) for {decision_id}:\n"
                    explanation += f"Component: {decision_log.get('component')}\n"
                    explanation += f"Decision: {json.dumps(decision_log.get('decision'), default=str)}\n"
                    explanation += "Reasoning Steps:\n" + "\n".join([f"- {step}" for step in cot])
                    return explanation
                else:
                    return f"Explanation failed: No Chain of Thought recorded for decision {decision_id}."

            elif method == 'shap' or method == 'lime':
                # Priority: Medium - Integrate SHAP/LIME libraries
                logging.warning(f"SHAP/LIME explanation method requires integration with respective libraries and access to the model and data instance used for the decision.")
                # Conceptual Steps:
                # 1. Identify the model used (decision_log['model_used'])
                # 2. Get access to the model (e.g., via MoEManager or load from path)
                # 3. Get the specific input instance (decision_log['inputs'] or reconstruct from context)
                # 4. Get background/training data for SHAP/LIME explainer initialization
                # 5. Initialize SHAP/LIME explainer (e.g., shap.KernelExplainer, lime.lime_tabular.LimeTabularExplainer)
                # 6. Generate explanation (e.g., explainer.shap_values(), explainer.explain_instance())
                # 7. Format and return the explanation (feature importances, local prediction explanation)
                # model = self.moe_manager.get_model_predictor(decision_log['model_used']) # Needs implementation
                # data_instance = decision_log['inputs']['vector'] # Example input
                # background_data = ... # Needs access to training/background data
                # if method == 'shap':
                #    explainer = shap.KernelExplainer(model.predict, background_data)
                #    shap_values = explainer.shap_values(data_instance)
                #    return f"SHAP Explanation (Feature Importances): {shap_values}" # Format appropriately
                # else: # lime
                #    explainer = lime.lime_tabular.LimeTabularExplainer(...)
                #    explanation = explainer.explain_instance(...)
                #    return f"LIME Explanation: {explanation.as_list()}" # Format appropriately
                return f"Explanation failed: {method.upper()} method not fully implemented. Requires model access and data instance."

            elif method == 'counterfactual':
                 # Priority: Medium - Integrate LLaMA 4 for counterfactual generation
                logging.info("Generating counterfactual explanation using LLaMA 4...")
                context_summary = json.dumps(decision_log.get('context'), default=str)
                decision_summary = json.dumps(decision_log.get('decision'), default=str)
                prompt = (f"Given the context: {context_summary}\n"
                          f"The system made the decision: {decision_summary}\n"
                          f"Generate a counterfactual explanation: What minimal change in the context or inputs "
                          f"would have led to a different desirable outcome? Explain why.")

                try:
                    # counterfactual_explanation = self.fm_client.generate(
                    #     model_id=self.config.get('xai.counterfactual_model_id', 'llama4-maverick'),
                    #     prompt=prompt,
                    #     max_tokens=200
                    # )
                    # Placeholder:
                    counterfactual_explanation = f"Simulated Counterfactual: If the 'user_budget' in the context was increased by 10%, the decision might have been 'approved' because it would meet the minimum threshold."
                    return f"Counterfactual Explanation for {decision_id}:\n{counterfactual_explanation}"
                except Exception as e:
                    logging.error(f"LLaMA 4 counterfactual generation failed: {e}")
                    return f"Explanation failed: Could not generate counterfactual explanation due to error: {e}"

            elif method == 'moe_activation':
                # Priority: Low - Requires deeper integration/access to model internals
                logging.warning("MoE Activation analysis is conceptual and requires access to model internals (potentially future work).")
                return "Explanation failed: MoE Activation analysis not implemented."

            else:
                return f"Explanation failed: Unsupported explanation method '{method}'."

        except Exception as e:
            logging.exception(f"An error occurred during explanation generation for {decision_id}: {e}")
            return f"Explanation failed: An unexpected error occurred: {e}"

    def provide_role_based_explanation(self, decision_id: str, role: str, method: str = 'chain_of_thought'):
        """Generates an explanation tailored to a specific user role."""
        base_explanation = self.explain_decision(decision_id, method)

        # TODO: Implement logic to tailor the explanation based on the role
        # e.g., simplify technical details for business users, provide more model specifics for data scientists
        logging.info(f"Tailoring explanation for role: {role} (placeholder)")
        tailored_explanation = f"[{role} View]\n{base_explanation}" # Basic prefixing for now

        return tailored_explanation


# Example Usage (Conceptual)
# config = MockEnhancedConfig()
# fm_client = MockFoundationModelClient()
# kg_adapter = MockKGAdapter()
# xai = ExplainableAI(config, fm_client, kg_adapter)

# # 1. Record a decision (e.g., from R2Reasoning or HDE)
# decision_details = {
#     'decision_id': 'example_decision_id_cot', 'component': 'R2Reasoning',
#     'context': {'user_query': 'Why was my ad rejected?'}, 'inputs': {'policy_docs': '[docs]', 'ad_content': '[content]'},
#     'decision': {'status': 'rejected', 'reason_code': 'POLICY_VIOLATION'}, 'outputs': {'raw_llm_output': '...'},
#     'chain_of_thought': [
#         "Step 1: Identify relevant policy sections based on ad content.",
#         "Step 2: Compare ad content against policy criteria.",
#         "Step 3: Found violation regarding 'prohibited claims'.",
#         "Step 4: Formulate rejection reason based on violation."
#     ],
#     'model_used': 'llama4-maverick'
# }
# xai.record_decision(**decision_details)

# # 2. Request explanations
# cot_explanation = xai.explain_decision('example_decision_id_cot', method='chain_of_thought')
# print("--- CoT Explanation ---")
# print(cot_explanation)

# counterfactual_explanation = xai.explain_decision('example_decision_id_cot', method='counterfactual')
# print("\n--- Counterfactual Explanation ---")
# print(counterfactual_explanation)

# shap_explanation = xai.explain_decision('example_decision_id_cot', method='shap')
# print("\n--- SHAP Explanation ---")
# print(shap_explanation) # Will likely show the "not implemented" message

# role_based_explanation = xai.provide_role_based_explanation('example_decision_id_cot', role='Business User', method='chain_of_thought')
# print("\n--- Role-Based Explanation (Business User) ---")
# print(role_based_explanation)

In [43]:
# Cell 15: MIZ MoA System (Replaces Agent Orchestrator - AW Pillar) - Integrated
# Status: Major Refactor Complete. Integrates MoA layered structure, UnifiedCommunicationSystem, REWOO planning (via BossAgent), EnhancedBaseAgent. Leverages RobustTaskQueue/TaskPersistenceManager abstractions within communication/state management. Removes ThreadPoolExecutor.
# OKI Requirements: Production-ready queue/persistence backend (placeholders used by comms). Asynchronous processing via asyncio/messaging. Integration with MoA layered structure and communication patterns. Task pausing/resuming handled via task state/messaging.
# Reasoning: This cell now embodies the MoA architecture. It manages layers of agents, uses a dedicated communication system (which leverages the robust queue/persistence), and delegates planning to the BossAgent/REWOO. Task execution is inherently asynchronous through the message-driven agent interactions. Pause/resume logic interacts with the external task persistence layer.

import logging
import datetime
import uuid
import time
import random
from collections import deque, defaultdict, Counter
import asyncio # Core for MoA execution
from typing import Dict, Any, Optional, List, Union, Type, Callable # Added Type, Callable
import json
import os
from abc import ABC, abstractmethod
import heapq # For potential priority queue implementation
import traceback # For error handling
from enum import Enum, auto # Added for MessageType
from dataclasses import dataclass, field # Added for AgentMessage
import nest_asyncio # Added for notebook compatibility

# Apply nest_asyncio early for notebook environments
nest_asyncio.apply()

# --- Placeholder Dependencies (Assume these exist or are implemented elsewhere) ---
# Import necessary classes from other cells if they are defined there
# Example: from cell1 import EnhancedConfig, CONFIG
# Example: from cell3 import EnhancedSelfHealingKG
# Example: from cell7 import HyperdimensionalPersonalization # Needed for ChurnAgent example

# --- Define Placeholder Classes if Dependencies Not Available ---
class PlaceholderKG:
    def get_entity(self, *args, **kwargs): logger.debug(f"PlaceholderKG.get_entity called"); return {"type": "placeholder", "mizId": args[0], "name": "Placeholder"}
    def add_relationship(self, *args, **kwargs): logger.debug(f"PlaceholderKG.add_relationship called"); return True
class PlaceholderPersonalization:
    def get_personalized_recommendations(self, *args, **kwargs): logger.debug(f"PlaceholderPersonalization.get_personalized_recommendations called"); return ["OFFER_DEFAULT"]
class PlaceholderConfig(dict): # Simple dict-like config
     def get(self, key, default=None):
          # Simulate nested access for specific keys used in init
          if key == "task_queue_type": return self.get("orchestrator", {}).get("queue_type", default or "memory")
          if key == "task_persistence_type": return self.get("orchestrator", {}).get("persistence_type", default or "file")
          if key == "task_persistence_filepath": return self.get("orchestrator", {}).get("persistence_filepath", default or "miz3_moa_state.json")
          if key == "moa_layer_configs": return self.get("moa", {}).get("layer_configs", default or {})
          return super().get(key, default)

# --- End Placeholder Dependencies ---

logger = logging.getLogger('MIZ-OKI.MoASystem')

# --- Robust Queue/Persistence Interfaces & Placeholders (Adapted from previous Cell 15) ---
# Using asyncio for async operations

class RobustTaskQueue(ABC):
    @abstractmethod
    async def enqueue(self, task: Dict): pass
    @abstractmethod
    async def dequeue(self, timeout: Optional[float] = None) -> Optional[Dict]: pass
    @abstractmethod
    async def task_done(self, task_id: str): pass
    @abstractmethod
    async def requeue_on_failure(self, task: Dict, delay_seconds: float = 0): pass
    @abstractmethod
    async def get_pending_count(self) -> int: pass
    @abstractmethod
    async def connect(self): pass
    @abstractmethod
    async def close(self): pass

class MemoryQueue(RobustTaskQueue):
    """Async In-memory queue (non-persistent, for testing/notebook)."""
    def __init__(self, config):
        self.queue = asyncio.Queue()
        self.logger = logging.getLogger('MIZ-OKI.MemoryQueue')

    async def connect(self):
        self.logger.warning("Using MemoryQueue - suitable only for testing, not production.")
        self.logger.info("MemoryQueue connected (in-memory).")

    async def enqueue(self, task: Dict):
        await self.queue.put(task)
        self.logger.debug(f"Task {task.get('id', 'N/A')} enqueued.") # Use message ID

    async def dequeue(self, timeout: Optional[float] = None) -> Optional[Dict]:
        try:
            task = await asyncio.wait_for(self.queue.get(), timeout=timeout)
            self.logger.debug(f"Task {task.get('id', 'N/A')} dequeued.")
            return task
        except asyncio.TimeoutError:
            return None

    async def task_done(self, task_id: str):
        # In asyncio.Queue, task_done is called on the queue object after processing
        # Here, we just log as the dequeue consumer handles the ack implicitly
        self.logger.debug(f"Task {task_id} marked as done (memory queue).")

    async def requeue_on_failure(self, task: Dict, delay_seconds: float = 0):
        self.logger.warning(f"Re-queueing task {task.get('id', 'N/A')} after failure (delay: {delay_seconds}s).")
        if delay_seconds > 0:
            await asyncio.sleep(delay_seconds)
        await self.enqueue(task)

    async def get_pending_count(self) -> int:
        return self.queue.qsize()

    async def close(self):
        self.logger.info("MemoryQueue closed.")

class TaskPersistenceManager(ABC):
    @abstractmethod
    async def save_task(self, task: Dict): pass
    @abstractmethod
    async def load_task(self, task_id: str) -> Optional[Dict]: pass
    @abstractmethod
    async def load_all_tasks(self, status_filter: Optional[str] = None) -> List[Dict]: pass
    @abstractmethod
    async def delete_task(self, task_id: str): pass
    @abstractmethod
    async def save_dlq_task(self, task: Dict, reason: str): pass
    @abstractmethod
    async def connect(self): pass
    @abstractmethod
    async def close(self): pass

class FilePersistenceManager(TaskPersistenceManager):
    """Async file-based persistence (for testing/notebook)."""
    def __init__(self, config):
        self.filepath = config.get("task_persistence_filepath", "miz3_moa_state.json")
        self.dlq_filepath = self.filepath.replace(".json", "_dlq.jsonl")
        self.lock = asyncio.Lock() # Use asyncio lock
        self.logger = logging.getLogger('MIZ-OKI.FilePersistence')

    async def connect(self):
        try:
            os.makedirs(os.path.dirname(self.filepath), exist_ok=True)
            self.logger.warning("Using FilePersistenceManager - suitable only for testing, not production.")
            self.logger.info(f"FilePersistenceManager initialized. State file: {self.filepath}")
        except Exception as e:
            self.logger.error(f"Failed to create directory for persistence file {self.filepath}: {e}")

    async def _load_state(self) -> Dict[str, Dict]:
        async with self.lock:
            if os.path.exists(self.filepath):
                try:
                    # Use asyncio.to_thread for sync file I/O
                    content = await asyncio.to_thread(self._read_file_sync, self.filepath)
                    if not content: return {}
                    return json.loads(content)
                except (json.JSONDecodeError, IOError) as e:
                    self.logger.error(f"Error loading state file {self.filepath}: {e}. Starting fresh.")
                    return {}
            return {}

    def _read_file_sync(self, path):
        """Synchronous helper for reading file."""
        with open(path, 'r') as f:
            return f.read()

    async def _save_state(self, state: Dict[str, Dict]):
        async with self.lock:
            try:
                # Use asyncio.to_thread for sync file I/O
                await asyncio.to_thread(self._write_file_sync, self.filepath, state)
            except IOError as e:
                self.logger.error(f"Error saving state file {self.filepath}: {e}")
                # Backup/restore logic omitted for brevity in async version

    def _write_file_sync(self, path, state):
        """Synchronous helper for writing file."""
        # Basic write, backup logic omitted for async simplicity
        with open(path, 'w') as f:
            json.dump(state, f, indent=2, default=str)

    async def save_task(self, task: Dict):
        state = await self._load_state()
        # Use external task ID ('task_id') or message ID ('id')
        task_key = task.get("task_id") or task.get("id")
        if not task_key:
             self.logger.error("Task dictionary missing 'task_id' or 'id'. Cannot save.")
             return
        state[task_key] = task
        await self._save_state(state)
        self.logger.debug(f"Task {task_key} saved to persistence.")

    async def load_task(self, task_id: str) -> Optional[Dict]:
        state = await self._load_state()
        return state.get(task_id)

    async def load_all_tasks(self, status_filter: Optional[str] = None) -> List[Dict]:
        state = await self._load_state()
        tasks = list(state.values())
        if status_filter:
            tasks = [t for t in tasks if t.get("status") == status_filter]
        return tasks

    async def delete_task(self, task_id: str):
        state = await self._load_state()
        if task_id in state:
            del state[task_id]
            await self._save_state(state)
            self.logger.debug(f"Task {task_id} deleted from persistence.")

    async def save_dlq_task(self, task: Dict, reason: str):
        dlq_entry = {
            "timestamp": datetime.datetime.now().isoformat(),
            "task_id": task.get("task_id") or task.get("id"),
            "task_data": task,
            "reason": reason
        }
        async with self.lock:
            try:
                # Use asyncio.to_thread for sync file I/O
                await asyncio.to_thread(self._append_dlq_sync, self.dlq_filepath, dlq_entry)
                self.logger.warning(f"Task {dlq_entry['task_id']} saved to DLQ: {self.dlq_filepath}")
            except IOError as e:
                self.logger.error(f"Error saving task to DLQ file {self.dlq_filepath}: {e}")

    def _append_dlq_sync(self, path, entry):
        """Synchronous helper for appending to DLQ."""
        with open(path, 'a') as f:
            f.write(json.dumps(entry, default=str) + '\n')

    async def close(self):
        self.logger.info("FilePersistenceManager closed.")

# --- MoA Core Class Definitions (Adapted/Simplified from code_cell6_x) ---
class MessageType(Enum):
    TASK_ASSIGNMENT = auto()
    TASK_RESULT = auto()
    CONTEXT_UPDATE = auto()
    ERROR_NOTIFICATION = auto()
    RESOURCE_REQUEST = auto()
    COLLABORATION_REQUEST = auto()
    CONTROL_COMMAND = auto() # Added for pause/resume

@dataclass
class AgentMessage:
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    sender: str = 'system'
    receiver: str = '' # Can be agent ID or 'broadcast' or 'layer_X'
    message_type: MessageType = MessageType.TASK_ASSIGNMENT
    content: Dict[str, Any] = field(default_factory=dict)
    timestamp: datetime.datetime = field(default_factory=datetime.datetime.now) # Use datetime.datetime
    context: Dict[str, Any] = field(default_factory=dict)
    priority: int = 5
    trace_id: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        return {
            'id': self.id, 'sender': self.sender, 'receiver': self.receiver,
            'message_type': self.message_type.name, 'content': self.content,
            'timestamp': self.timestamp.isoformat(), 'context': self.context,
            'priority': self.priority, 'trace_id': self.trace_id
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'AgentMessage':
        # Handle potential string timestamp
        ts_str = data.get('timestamp')
        timestamp = datetime.datetime.fromisoformat(ts_str) if isinstance(ts_str, str) else datetime.datetime.now()

        return cls(
            id=data.get('id', str(uuid.uuid4())), sender=data.get('sender'), receiver=data.get('receiver'),
            message_type=MessageType[data.get('message_type', 'TASK_ASSIGNMENT')],
            content=data.get('content', {}), timestamp=timestamp,
            context=data.get('context', {}), priority=data.get('priority', 5), trace_id=data.get('trace_id')
        )

class UnifiedCommunicationSystem:
    """Handles message routing between agents using RobustTaskQueue."""
    def __init__(self, config: Dict, queue_impl: RobustTaskQueue):
        self.config = config
        self.persistent_queue = queue_impl # Use the robust queue
        self.logger = logging.getLogger('MIZ-OKI.CommunicationSystem')
        self.agent_registry_ref: Optional[Dict[str, Any]] = None # Will be set by MoA system
        self._receive_tasks: Dict[str, asyncio.Task] = {} # Store background receive tasks

    async def initialize(self, agent_registry: Dict):
        self.agent_registry_ref = agent_registry
        await self.persistent_queue.connect()
        # Start background listener for the persistent queue
        self._listener_task = asyncio.create_task(self._persistent_queue_listener())
        self.logger.info("Unified Communication System initialized and listener started.")

    async def _persistent_queue_listener(self):
        """Listens to the persistent queue and routes messages to agent internal queues."""
        self.logger.info("Persistent queue listener started.")
        while True:
            try:
                # Dequeue messages intended for *any* agent or broadcast
                # This assumes the queue implementation allows broad dequeuing or uses subscriptions.
                # For simple queues, this might just pull the next message.
                task_dict = await self.persistent_queue.dequeue(timeout=5.0)
                if task_dict:
                    try:
                        message = AgentMessage.from_dict(task_dict)
                        receiver = message.receiver
                        agent_instance = self.agent_registry_ref.get(receiver)

                        if agent_instance and hasattr(agent_instance, 'message_queue'):
                            await agent_instance.message_queue.put(message)
                            self.logger.debug(f"Routed message {message.id} to agent {receiver}'s internal queue.")
                            # Acknowledge message processed by the router
                            # Task done might need message ID or specific handle from queue
                            await self.persistent_queue.task_done(task_dict.get('id') or task_dict.get('task_id'))
                        elif receiver == 'broadcast':
                             self.logger.debug(f"Broadcasting message {message.id}...")
                             for agent_id, agent in self.agent_registry_ref.items():
                                  if hasattr(agent, 'message_queue'):
                                       await agent.message_queue.put(message)
                             await self.persistent_queue.task_done(task_dict.get('id') or task_dict.get('task_id'))
                        else:
                            self.logger.warning(f"Receiver '{receiver}' not found or has no queue for message {message.id}. Requeueing.")
                            # Requeue with a delay if receiver not found yet (might be starting)
                            await self.persistent_queue.requeue_on_failure(task_dict, delay_seconds=5)

                    except (KeyError, ValueError, TypeError) as msg_e:
                         self.logger.error(f"Failed to parse or route message from queue: {msg_e}. Message data: {task_dict}")
                         # Decide how to handle bad messages (DLQ?)
                         # Ack the bad message to prevent requeue loop
                         await self.persistent_queue.task_done(task_dict.get('id') or task_dict.get('task_id'))
                    except Exception as route_e:
                         self.logger.error(f"Error routing message: {route_e}", exc_info=True)
                         # Requeue on unexpected routing errors
                         await self.persistent_queue.requeue_on_failure(task_dict, delay_seconds=10)

                else:
                    # No message, continue listening
                    pass
            except asyncio.CancelledError:
                self.logger.info("Persistent queue listener cancelled.")
                break
            except Exception as e:
                self.logger.error(f"Error in persistent queue listener: {e}", exc_info=True)
                await asyncio.sleep(5) # Avoid rapid failure loops

    async def send_message(self, message: AgentMessage):
        """Sends a message via the persistent queue."""
        self.logger.debug(f"Sending message {message.id} from {message.sender} to {message.receiver} ({message.message_type.name}) via persistent queue.")
        try:
            await self.persistent_queue.enqueue(message.to_dict())
        except Exception as e:
            self.logger.error(f"Failed to enqueue message {message.id}: {e}")
            # Handle enqueue failure (e.g., retry, DLQ)

    async def receive_message_for_agent(self, agent_id: str, timeout: Optional[float] = 5.0) -> Optional[AgentMessage]:
        """Receives a message from the agent's internal queue."""
        agent_instance = self.agent_registry_ref.get(agent_id)
        if not agent_instance or not hasattr(agent_instance, 'message_queue'):
            self.logger.error(f"Agent {agent_id} not found or has no message queue.")
            return None
        try:
            message = await asyncio.wait_for(agent_instance.message_queue.get(), timeout=timeout)
            agent_instance.message_queue.task_done() # Ack internal queue
            self.logger.debug(f"Agent {agent_id} received message {message.id} from internal queue.")
            return message
        except asyncio.TimeoutError:
            return None
        except Exception as e:
            self.logger.error(f"Error receiving message for {agent_id} from internal queue: {e}")
            return None

    async def cleanup(self):
        if hasattr(self, '_listener_task') and self._listener_task:
            self._listener_task.cancel()
            try:
                await self._listener_task
            except asyncio.CancelledError:
                pass # Expected
        await self.persistent_queue.close()
        self.logger.info("Unified Communication System cleaned up.")

class EnhancedBaseAgent(ABC):
    """Enhanced Base Agent with Async Communication and Lifecycle."""
    def __init__(self, agent_id: str, config: Dict, communication_system: UnifiedCommunicationSystem, knowledge_graph: Any, capabilities: List[str] = None):
        self.agent_id = agent_id
        self.config = config
        self.communication = communication_system
        self.kg = knowledge_graph
        self.capabilities = capabilities or []
        self._status = "initializing"
        self._current_task_id = None
        self._task_context = {}
        self._shutdown_event = asyncio.Event()
        self.logger = logging.getLogger(f'MIZ-OKI.Agent.{agent_id}')
        self.message_queue = asyncio.Queue() # Internal queue for received messages

    async def initialize(self):
        """Initialize agent resources."""
        self.logger.info(f"Initializing...")
        await self._load_state() # Load persistent state if any
        self._status = "idle"
        self.logger.info(f"Initialized successfully.")

    async def _load_state(self):
        self.logger.debug("Loading agent state (placeholder).")
        pass

    async def _save_state(self):
        self.logger.debug("Saving agent state (placeholder).")
        pass

    async def run(self):
        """Main agent loop to process messages from internal queue."""
        self.logger.info("Starting agent run loop.")
        while not self._shutdown_event.is_set():
            try:
                message = await self.message_queue.get()
                await self.handle_message(message)
                self.message_queue.task_done() # Ack internal queue
            except asyncio.CancelledError:
                self.logger.info("Run loop cancelled.")
                break
            except Exception as e:
                self.logger.error(f"Error in agent run loop: {e}", exc_info=True)
                # Avoid rapid failure loops if queue is constantly failing
                await asyncio.sleep(1)
        self.logger.info("Agent run loop stopped.")

    async def handle_message(self, message: AgentMessage):
        """Handles incoming messages."""
        self.logger.debug(f"Received message {message.id} type {message.message_type.name} from {message.sender}")
        self._current_task_id = message.trace_id or message.id
        self._task_context = message.context
        start_time = time.time()

        try:
            if message.message_type == MessageType.TASK_ASSIGNMENT:
                task_type = message.content.get('task_type')
                if task_type in self.capabilities:
                    self._status = "running"
                    self.logger.info(f"Starting task {self._current_task_id} ({task_type}).")
                    result = await self.process_task(task_type, message.content) # Pass task_type
                    duration = time.time() - start_time
                    self.logger.info(f"Task {self._current_task_id} completed successfully in {duration:.2f}s.")
                    await self.send_response(message.sender, MessageType.TASK_RESULT, {"result": result, "duration": duration}, message.id)
                else:
                    await self.send_error(message.sender, f"Capability '{task_type}' not supported.", message.id)
            elif message.message_type == MessageType.CONTEXT_UPDATE:
                await self.update_internal_context(message.content)
            elif message.message_type == MessageType.CONTROL_COMMAND:
                 await self.handle_control_command(message.content)
            else:
                self.logger.warning(f"Unhandled message type: {message.message_type.name}")

        except NotImplementedError as nie:
             duration = time.time() - start_time
             self.logger.error(f"Task {self._current_task_id} failed: {nie}.")
             await self.send_error(message.sender, f"Task type not implemented: {nie}", message.id)
        except Exception as e:
            duration = time.time() - start_time
            self.logger.error(f"Error handling message {message.id} after {duration:.2f}s: {e}", exc_info=True)
            await self.send_error(message.sender, f"Internal agent error: {e}", message.id)
        finally:
            self._status = "idle"
            self._current_task_id = None
            self._task_context = {}

    @abstractmethod
    async def process_task(self, task_type: str, task_details: Dict[str, Any]) -> Any:
        """Core logic for processing a task. Must be overridden."""
        pass

    async def update_internal_context(self, context_update: Dict[str, Any]):
        self.logger.debug(f"Updating internal context: {context_update}")
        pass

    async def handle_control_command(self, command_details: Dict[str, Any]):
        command = command_details.get("command")
        self.logger.info(f"Received control command: {command}")
        if command == "pause":
             self._status = "paused"
        elif command == "resume":
             if self._status == "paused":
                  self._status = "idle"
        elif command == "shutdown":
             await self.cleanup()
        else:
             self.logger.warning(f"Unknown control command: {command}")

    async def send_message(self, receiver: str, msg_type: MessageType, content: Dict, priority: int = 5, context: Optional[Dict] = None):
        if not self.communication: return
        message = AgentMessage(
            sender=self.agent_id, receiver=receiver, message_type=msg_type,
            content=content, priority=priority, trace_id=self._current_task_id,
            context={**(self._task_context or {}), **(context or {})}
        )
        await self.communication.send_message(message)

    async def send_response(self, receiver: str, msg_type: MessageType, result_content: Any, original_msg_id: str):
        await self.send_message(receiver, msg_type, {"result": result_content, "original_message_id": original_msg_id})

    async def send_error(self, receiver: str, error_message: str, original_msg_id: Optional[str] = None):
        content = {"error": error_message}
        if original_msg_id: content["original_message_id"] = original_msg_id
        await self.send_message(receiver, MessageType.ERROR_NOTIFICATION, content, priority=10)

    async def cleanup(self):
        self.logger.info("Shutting down...")
        self._shutdown_event.set()
        await self._save_state()
        self.logger.info("Cleanup complete.")

    def get_status(self) -> Dict[str, Any]:
        return {"agent_id": self.agent_id, "status": self._status, "current_task": self._current_task_id}

# --- REWOO Planning System (Placeholder - Needs Full Implementation) ---
class REWOOSystem:
    """Placeholder for REWOO planning logic."""
    def __init__(self, config):
        self.config = config
        self.logger = logging.getLogger('MIZ-OKI.REWOO')

    async def initialize(self):
        self.logger.info("REWOO Planning System initialized (Placeholder).")

    async def create_plan(self, task_details: Dict[str, Any]) -> Dict[str, Any]:
        self.logger.info(f"REWOO: Creating plan for task: {str(task_details)[:100]}...")
        # MIZ 3.0 TODO: Implement actual REWOO planning logic (Observe, Plan, Execute cycle simulation)
        # Placeholder plan:
        plan = {
            "plan_id": f"plan_{uuid.uuid4()}",
            "steps": [
                {"step_id": 1, "layer": 1, "agent_type": "DataProcessingAgent", "task_type": "ingest_data", "details": task_details.get("data_sources")},
                {"step_id": 2, "layer": 2, "agent_type": "AnalysisAgent", "task_type": "analyze_trends", "depends_on": 1},
                {"step_id": 3, "layer": 3, "agent_type": "KnowledgeGraphAgent", "task_type": "update_entity", "depends_on": 2},
                {"step_id": 4, "layer": 4, "agent_type": "ActionAgent", "task_type": "generate_report", "depends_on": 3},
            ],
            "task_details": task_details
        }
        self.logger.info(f"REWOO: Generated plan {plan['plan_id']}")
        return plan

# --- Agent Factory (Adapted) ---
class AgentFactory:
    def __init__(self, config: Dict, communication_system: UnifiedCommunicationSystem, knowledge_graph: Any):
        self.config = config
        self.communication = communication_system
        self.kg = knowledge_graph
        # MIZ 3.0: Ensure all required agent classes are defined and imported
        self.agent_classes: Dict[str, Type[EnhancedBaseAgent]] = {
            "BossAgent": BossAgent,
            "DataProcessingAgent": DataProcessingAgent,
            "AnalysisAgent": AnalysisAgent,
            "KnowledgeGraphAgent": KnowledgeGraphAgent,
            "ActionAgent": ActionAgent,
            "ChurnRescueAgent": ChurnRescueAgent,
            "PlaceholderAgent": PlaceholderAgent,
            "CommunicationAgent": PlaceholderAgent, # Use placeholder for now
        }
        self.logger = logging.getLogger('MIZ-OKI.AgentFactory')

    async def create_agent(self, agent_type: str, agent_id: str, capabilities: List[str]) -> Optional[EnhancedBaseAgent]:
        if agent_type not in self.agent_classes:
            self.logger.error(f"Unknown agent type requested: {agent_type}")
            return None
        try:
            agent_class = self.agent_classes[agent_type]
            # Inject dependencies - Add specific ones if needed
            agent_dependencies = {
                "agent_id": agent_id,
                "config": self.config,
                "communication_system": self.communication,
                "knowledge_graph": self.kg,
                "capabilities": capabilities
            }
            # Add specific dependencies for certain agents
            if agent_type == "ChurnRescueAgent":
                 # Assuming personalization engine is accessible, e.g., via config or passed to factory
                 _hyperpersonalization = hyperpersonalization if 'hyperpersonalization' in globals() else PlaceholderPersonalization()
                 agent_dependencies["personalization_engine"] = _hyperpersonalization

            agent = agent_class(**agent_dependencies)
            await agent.initialize()
            return agent
        except Exception as e:
            self.logger.error(f"Failed to create or initialize agent {agent_id} (Type: {agent_type}): {e}", exc_info=True)
            return None

# --- Define Agent Classes (Placeholders/Examples Inheriting EnhancedBaseAgent) ---
class BossAgent(EnhancedBaseAgent):
    """Coordinates tasks across layers, potentially using REWOO."""
    def __init__(self, agent_id: str, config: Dict, communication_system: UnifiedCommunicationSystem, knowledge_graph: Any, capabilities: List[str] = None):
        super().__init__(agent_id, config, communication_system, knowledge_graph, capabilities or ["plan_workflow", "coordinate_layers", "process_external_task"])
        self.planner = REWOOSystem(config) # Integrate REWOO

    async def initialize(self):
        await super().initialize()
        await self.planner.initialize()
        self.logger.info("BossAgent specific initialization complete.")

    async def process_task(self, task_type: str, task_details: Dict[str, Any]) -> Any:
        if task_type == "process_external_task":
            self.logger.info(f"Received external task: {str(task_details)[:100]}... Planning execution.")
            plan = await self.planner.create_plan(task_details)
            self.logger.info(f"Generated plan: {plan.get('plan_id')}")

            # Dispatch first step(s) based on plan dependencies
            if plan and plan.get("steps"):
                # MIZ 3.0 TODO: Implement dependency handling
                first_step = plan["steps"][0] # Simple case: dispatch first step
                target_layer = first_step.get("layer", 1)
                target_agent_type = first_step.get("agent_type")
                # Find an agent of the target type in the target layer
                target_agent_id = None
                # Need access to the MoA system's layer structure here, or send to layer broadcast
                # For simplicity, send to layer broadcast for now
                receiver = f"layer_{target_layer}"

                await self.send_message(
                    receiver=receiver,
                    msg_type=MessageType.TASK_ASSIGNMENT,
                    content=first_step, # Send step details
                    context={"plan": plan, "full_task": task_details} # Pass context
                )
                return {"status": "plan_dispatched", "plan_id": plan.get("plan_id")}
            else:
                 self.logger.error("Planning failed or produced empty plan.")
                 return {"status": "planning_failed"}
        else:
             raise NotImplementedError(f"BossAgent does not support task type: {task_type}")

    # MIZ 3.0 TODO: Implement handling of results from layers, error handling, replanning

class DataProcessingAgent(EnhancedBaseAgent):
     async def process_task(self, task_type: str, task_details: Dict[str, Any]) -> Any:
          self.logger.info(f"Layer 1: Processing data task '{task_type}': {str(task_details)[:100]}...")
          await asyncio.sleep(random.uniform(0.5, 1.5))
          processed_data = {"processed": True, "input": task_details, "layer1_output": f"data_{uuid.uuid4()}"}
          # Send result to next layer (Layer 2 Analysis)
          await self.send_message("layer_2", MessageType.TASK_ASSIGNMENT, {"task_type": "analyze_data", **processed_data})
          return processed_data

class AnalysisAgent(EnhancedBaseAgent):
     async def process_task(self, task_type: str, task_details: Dict[str, Any]) -> Any:
          self.logger.info(f"Layer 2: Analyzing data task '{task_type}': {str(task_details)[:100]}...")
          await asyncio.sleep(random.uniform(0.5, 2.0))
          analysis = {"insights": ["Insight A", "Insight B"], "source_data": task_details, "layer2_output": f"analysis_{uuid.uuid4()}"}
          await self.send_message("layer_3", MessageType.TASK_ASSIGNMENT, {"task_type": "update_kg", **analysis})
          return analysis

class KnowledgeGraphAgent(EnhancedBaseAgent):
     async def process_task(self, task_type: str, task_details: Dict[str, Any]) -> Any:
          self.logger.info(f"Layer 3: Updating KG task '{task_type}': {str(task_details)[:100]}...")
          await asyncio.sleep(random.uniform(0.2, 0.8))
          kg_update_status = {"kg_updated": True, "nodes_affected": random.randint(1, 5), "layer3_output": f"kg_update_{uuid.uuid4()}"}
          await self.send_message("layer_4", MessageType.TASK_ASSIGNMENT, {"task_type": "perform_action", **kg_update_status})
          return kg_update_status

class ActionAgent(EnhancedBaseAgent):
     async def process_task(self, task_type: str, task_details: Dict[str, Any]) -> Any:
          self.logger.info(f"Layer 4: Performing action task '{task_type}': {str(task_details)[:100]}...")
          await asyncio.sleep(random.uniform(0.3, 1.0))
          action_result = {"action_status": "completed", "external_id": f"ext_{random.randint(1000,9999)}"}
          # Send final result back to BossAgent
          await self.send_message("BossAgent_0_uuid", MessageType.TASK_RESULT, action_result) # Assuming BossAgent ID format
          return action_result

class PlaceholderAgent(EnhancedBaseAgent):
     async def process_task(self, task_type: str, task_details: Dict[str, Any]) -> Any:
          self.logger.info(f"Agent {self.agent_id}: Executing placeholder task '{task_type}'...")
          await asyncio.sleep(random.uniform(0.1, 0.5))
          return f"Placeholder result for {task_type}"

# --- Update ChurnRescueAgent ---
class ChurnRescueAgent(EnhancedBaseAgent):
    """Example agent to handle potential churn risks (Updated for MoA)."""
    def __init__(self, agent_id: str, config: Dict, communication_system: UnifiedCommunicationSystem, knowledge_graph: Any, personalization_engine: Any, capabilities: List[str] = None):
        super().__init__(agent_id, config, communication_system, knowledge_graph, capabilities or ["churn_intervention", "loyalty_offer"])
        self.personalization = personalization_engine
        if not self.personalization: self.logger.warning("Personalization engine not provided.")

    async def process_task(self, task_type: str, task_details: Dict[str, Any]) -> Any:
        if task_type == "churn_intervention":
            customer_id = task_details.get("customer_id")
            churn_probability = task_details.get("churn_probability", 0.0)
            if not customer_id: raise ValueError("Missing 'customer_id'")
            self.logger.info(f"Processing churn intervention for {customer_id} (Prob: {churn_probability:.2f})")

            customer_entity = await asyncio.to_thread(self.kg.get_entity, customer_id)
            if not customer_entity: return f"Customer {customer_id} not found."

            offer = "Default Retention Offer (10% off)"
            if self.personalization:
                 try:
                      catalog = [{"item_id": "OFFER_DISCOUNT_15"}, {"item_id": "OFFER_POINTS_500"}]
                      recs = await asyncio.to_thread(
                           self.personalization.get_personalized_recommendations,
                           customer_id, catalog, n=1, context={"intent": "churn_rescue"}
                      )
                      if recs: offer = f"Personalized Offer ID: {recs[0]}"
                 except Exception as e: self.logger.error(f"Personalization failed: {e}")

            comm_data = {"customer_id": customer_id, "channel": "email", "offer_details": offer}
            # Find a communication agent ID (assuming one exists)
            comm_agent_id = next((aid for aid, agent in self.communication.agent_registry_ref.items() if "CommunicationAgent" in aid), None)
            if comm_agent_id:
                 await self.send_message(comm_agent_id, MessageType.TASK_ASSIGNMENT, {"task_type": "send_personalized_message", **comm_data})
            else:
                 self.logger.warning("No CommunicationAgent found to send message.")

            try:
                await asyncio.to_thread(
                    self.kg.add_relationship,
                    {
                        "source_hints": {"type": "agent", "agent_id": self.agent_id},
                        "target_hints": {"type": "customer", "mizId": customer_id},
                        "type": "performed_intervention",
                        "offer": offer,
                        "timestamp": datetime.datetime.now().isoformat()
                    }
                )
            except Exception as kg_e: self.logger.error(f"Failed to log intervention in KG: {kg_e}")

            return f"Churn intervention initiated for {customer_id}. Offer: {offer}."
        else:
            raise NotImplementedError(f"Unsupported task type: {task_type}")

# --- MIZ MoA System (Replaces Orchestrator) ---
class MIZ_MoA_System:
    """
    Manages the Mixture-of-Agents system lifecycle, task intake, and overall coordination.
    Replaces the previous AgentOrchestrator logic.
    """
    def __init__(self, config: Dict, knowledge_graph: Any):
        self.config = config
        self.kg = knowledge_graph
        self.agents: Dict[str, EnhancedBaseAgent] = {}
        self.layer_agents: Dict[int, List[EnhancedBaseAgent]] = defaultdict(list)
        self.task_persistence: Optional[TaskPersistenceManager] = None
        self.communication_system: Optional[UnifiedCommunicationSystem] = None
        self.agent_factory: Optional[AgentFactory] = None
        self.boss_agent: Optional[BossAgent] = None
        self._running_agent_tasks: List[asyncio.Task] = []
        self._shutdown_event = asyncio.Event()
        self.logger = logging.getLogger('MIZ-OKI.MoASystem')
        self.initialized = False

    async def initialize(self):
        if self.initialized: return
        self.logger.info("Initializing MIZ MoA System...")
        try:
            queue_impl = self._init_queue(self.config)
            self.task_persistence = self._init_persistence(self.config)
            await self.task_persistence.connect()

            self.communication_system = UnifiedCommunicationSystem(self.config, queue_impl)
            self.agent_factory = AgentFactory(self.config, self.communication_system, self.kg)

            layer_configs = self.config.get("moa_layer_configs", {})
            if not layer_configs: self.logger.warning("No 'moa_layer_configs' found in config. No agents will be created.")

            for layer_id_str, layer_config in layer_configs.items():
                try: layer_id = int(layer_id_str)
                except ValueError: self.logger.error(f"Invalid layer ID '{layer_id_str}'. Skipping."); continue

                agent_types = layer_config.get("agents", [])
                for agent_type in agent_types:
                    # Create unique ID
                    agent_id = f"{agent_type}_{layer_id}_{uuid.uuid4().hex[:4]}"
                    capabilities = self._get_capabilities_for_agent(agent_type)
                    agent = await self.agent_factory.create_agent(agent_type, agent_id, capabilities)
                    if agent:
                        self.agents[agent_id] = agent
                        self.layer_agents[layer_id].append(agent)
                        if agent_type == "BossAgent":
                            if self.boss_agent: self.logger.warning("Multiple BossAgents defined, using the last one.")
                            self.boss_agent = agent
                    else:
                        self.logger.error(f"Failed to create agent {agent_type} for layer {layer_id}.")

            if not self.boss_agent:
                 self.logger.error("BossAgent not defined in 'moa_layer_configs' or failed to initialize. MoA system may not function correctly.")
                 # Decide whether to raise error or continue degraded
                 # raise RuntimeError("BossAgent failed to initialize.")

            await self.communication_system.initialize(self.agents) # Pass agent registry

            for agent in self.agents.values():
                task = asyncio.create_task(agent.run(), name=f"AgentLoop_{agent.agent_id}")
                self._running_agent_tasks.append(task)

            await self._load_external_tasks()

            self.initialized = True
            self.logger.info(f"MIZ MoA System initialized successfully with {len(self.agents)} agents across {len(self.layer_agents)} layers.")

        except Exception as e:
            self.logger.error(f"MIZ MoA System initialization failed: {e}", exc_info=True)
            await self.cleanup()
            raise

    def _init_queue(self, config_dict):
        queue_type = config_dict.get("task_queue_type", "memory")
        if queue_type == "memory": return MemoryQueue(config_dict)
        else: self.logger.warning(f"Unsupported queue type '{queue_type}', using MemoryQueue."); return MemoryQueue(config_dict)

    def _init_persistence(self, config_dict):
        persistence_type = config_dict.get("task_persistence_type", "file")
        if persistence_type == "file": return FilePersistenceManager(config_dict)
        else: self.logger.warning(f"Unsupported persistence type '{persistence_type}', using FilePersistence."); return FilePersistenceManager(config_dict)

    def _get_capabilities_for_agent(self, agent_type: str) -> List[str]:
        # MIZ 3.0 TODO: Load capabilities from config
        if agent_type == "BossAgent": return ["plan_workflow", "coordinate_layers", "process_external_task"]
        if agent_type == "DataProcessingAgent": return ["ingest_data", "process_log_file"]
        if agent_type == "AnalysisAgent": return ["analyze_data", "analyze_trends", "generate_report"]
        if agent_type == "KnowledgeGraphAgent": return ["update_kg", "update_entity", "query_graph"]
        if agent_type == "ActionAgent": return ["perform_action", "send_email", "adjust_bid"]
        if agent_type == "ChurnRescueAgent": return ["churn_intervention", "loyalty_offer"]
        if agent_type == "CommunicationAgent": return ["send_personalized_message", "send_notification"]
        return ["default_capability"]

    async def _load_external_tasks(self):
        try:
            pending = await self.task_persistence.load_all_tasks(status_filter="pending")
            paused = await self.task_persistence.load_all_tasks(status_filter="paused_for_approval")
            if pending:
                 self.logger.info(f"Found {len(pending)} pending external tasks. Re-submitting to BossAgent...")
                 for task in pending:
                      # Ensure task_details exist
                      task_details = task.get('task_details') or task.get('content') # Check common keys
                      if task_details:
                           await self.process_external_task(task_details, task_id_override=task.get('task_id') or task.get('id'))
                      else:
                           self.logger.warning(f"Skipping re-submission of task {task.get('task_id') or task.get('id')}: Missing task details.")
            if paused:
                 self.logger.info(f"Loaded {len(paused)} external tasks paused for approval.")
        except Exception as e:
            self.logger.error(f"Error loading external tasks from persistence: {e}")

    async def process_external_task(self, task_details: Dict[str, Any], task_id_override: Optional[str] = None) -> Optional[str]:
        if not self.initialized:
            self.logger.error("MoA system not initialized. Cannot process external task.")
            return None
        if not self.boss_agent:
             self.logger.error("BossAgent not available. Cannot process external task.")
             return None

        task_id = task_id_override or f"ext_task_{uuid.uuid4()}"
        self.logger.info(f"Received external task {task_id}: {str(task_details)[:100]}...")

        external_task_record = {
            "task_id": task_id, "task_details": task_details, "status": "pending",
            "submitted_at": datetime.datetime.now().isoformat(), "result": None, "error": None
        }
        try:
            await self.task_persistence.save_task(external_task_record)
        except Exception as e:
             self.logger.error(f"Failed to save external task {task_id} state: {e}")
             return None

        message = AgentMessage(
            sender="external_system", receiver=self.boss_agent.agent_id,
            message_type=MessageType.TASK_ASSIGNMENT,
            # Boss agent expects task_type in content
            content={"task_type": "process_external_task", **task_details},
            trace_id=task_id
        )
        await self.communication_system.send_message(message)
        self.logger.info(f"External task {task_id} sent to BossAgent.")
        return task_id

    async def pause_task(self, task_id: str, reason: str) -> bool:
        self.logger.info(f"Requesting pause for external task {task_id}. Reason: {reason}")
        try:
            task_record = await self.task_persistence.load_task(task_id)
            if not task_record: self.logger.error(f"Task {task_id} not found."); return False
            if task_record.get('status') == 'paused_for_approval': return True

            task_record['status'] = 'paused_for_approval'
            task_record['pause_reason'] = reason
            await self.task_persistence.save_task(task_record)

            # Send control command to BossAgent
            if self.boss_agent:
                 cmd_message = AgentMessage(
                      sender="system_control", receiver=self.boss_agent.agent_id,
                      message_type=MessageType.CONTROL_COMMAND,
                      content={"command": "pause", "reason": reason}, trace_id=task_id
                 )
                 await self.communication_system.send_message(cmd_message)
            return True
        except Exception as e:
            self.logger.error(f"Error pausing task {task_id}: {e}")
            return False

    async def resume_task(self, task_id: str, approver_id: str) -> bool:
        self.logger.info(f"Requesting resume for external task {task_id}. Approved by: {approver_id}")
        try:
            task_record = await self.task_persistence.load_task(task_id)
            if not task_record or task_record.get('status') != 'paused_for_approval':
                self.logger.warning(f"Task {task_id} not found or not paused.")
                return False

            task_record['status'] = 'pending'
            task_record['resumed_at'] = datetime.datetime.now().isoformat()
            task_record['resumed_by'] = approver_id
            task_record.pop('pause_reason', None)
            await self.task_persistence.save_task(task_record)

            # Re-submit to BossAgent
            await self.process_external_task(task_record['task_details'], task_id_override=task_id)
            return True
        except Exception as e:
            self.logger.error(f"Error resuming task {task_id}: {e}")
            return False

    async def get_task_status(self, task_id: str) -> Optional[Dict]:
        try:
            return await self.task_persistence.load_task(task_id)
        except Exception as e:
            self.logger.error(f"Error loading task status for {task_id}: {e}")
            return None

    def get_agent_status(self, agent_id: Optional[str] = None) -> Optional[Union[Dict, List[Dict]]]:
        if agent_id:
            agent = self.agents.get(agent_id)
            return agent.get_status() if agent else None
        else:
            return [agent.get_status() for agent in self.agents.values()]

    async def cleanup(self):
        if not self.initialized and not self._shutdown_event.is_set(): return
        self.logger.info("Initiating MIZ MoA System shutdown...")
        self._shutdown_event.set()

        agent_cleanup_tasks = [agent.cleanup() for agent in self.agents.values() if hasattr(agent, 'cleanup')]
        if agent_cleanup_tasks:
             await asyncio.gather(*agent_cleanup_tasks, return_exceptions=True)

        for task in self._running_agent_tasks:
            if not task.done(): task.cancel()
        if self._running_agent_tasks:
             await asyncio.gather(*self._running_agent_tasks, return_exceptions=True)
        self.logger.info("Agent tasks cancelled/finished.")

        if self.communication_system: await self.communication_system.cleanup()
        if self.task_persistence: await self.task_persistence.close()

        self.agents.clear(); self.layer_agents.clear(); self._running_agent_tasks.clear()
        self.initialized = False
        self.logger.info("MIZ MoA System shut down complete.")

# --- Initialization and Execution ---
# Assume eshkg and hyperpersonalization are available globally
_eshkg = PlaceholderKG() # Use placeholder for example
_hyperpersonalization = PlaceholderPersonalization() # Use placeholder
# Create a placeholder config if CONFIG is not loaded from Cell 1
if 'CONFIG' not in globals():
    CONFIG = PlaceholderConfig({
        "task_queue_type": "memory",
        "task_persistence_type": "file",
        "task_persistence_filepath": "miz3_moa_state.json",
        "moa_layer_configs": {
            "0": {"agents": ["BossAgent"]},
            "1": {"agents": ["DataProcessingAgent"]},
            "2": {"agents": ["AnalysisAgent"]},
            "3": {"agents": ["KnowledgeGraphAgent"]},
            "4": {"agents": ["ActionAgent", "ChurnRescueAgent", "CommunicationAgent"]} # Add specific agents
        }
    })

miz_moa_system: Optional[MIZ_MoA_System] = None

async def initialize_and_run_moa():
    global miz_moa_system
    if _eshkg and CONFIG:
        try:
            miz_moa_system = MIZ_MoA_System(config=CONFIG, knowledge_graph=_eshkg)
            await miz_moa_system.initialize()

            print("--- MIZ 3.0 MoA System Initialized (Integrated) ---")
            print(f"Using Queue: {miz_moa_system.communication_system.persistent_queue.__class__.__name__}, Persistence: {miz_moa_system.task_persistence.__class__.__name__}")
            print(f"Initialized Agents: {list(miz_moa_system.agents.keys())}")
            print("----------------------------------------------------")

            print("\nSubmitting example external task to MoA system...")
            test_task_id = await miz_moa_system.process_external_task({
                "task_description": "Analyze campaign 'SummerSale24'",
                "data_sources": ["GA4", "Ads"],
                "output_format": "summary_report"
            })
            print(f"Submitted external task with ID: {test_task_id}")

            if test_task_id:
                print("\nWaiting a few seconds for task processing...")
                await asyncio.sleep(10) # Wait longer

                print(f"\nChecking status for external task {test_task_id}:")
                status = await miz_moa_system.get_task_status(test_task_id)
                print(json.dumps(status, indent=2, default=str))

                # Example Pause/Resume
                print(f"\nPausing task {test_task_id}...")
                paused = await miz_moa_system.pause_task(test_task_id, "Awaiting review")
                if paused:
                     print("Task paused. Checking status:")
                     status = await miz_moa_system.get_task_status(test_task_id)
                     print(json.dumps(status, indent=2, default=str))
                     await asyncio.sleep(2)
                     print(f"Resuming task {test_task_id}...")
                     resumed = await miz_moa_system.resume_task(test_task_id, "admin_user")
                     print(f"Task resumed: {resumed}. Checking status after short delay:")
                     await asyncio.sleep(2)
                     status = await miz_moa_system.get_task_status(test_task_id)
                     print(json.dumps(status, indent=2, default=str))


            print("\nMoA system running. Use cleanup_moa() for graceful shutdown.")

        except Exception as e:
            print(f"An unexpected error occurred during MoA System initialization or run: {e}")
            logger.error("MoA System initialization/run failed.", exc_info=True)
            if miz_moa_system: await miz_moa_system.cleanup()
    else:
        print("Error: Knowledge Graph (eshkg) or CONFIG not found. Cannot initialize MoA system.")
        logger.error("eshkg or CONFIG not found. Skipping MoA System execution.")

async def cleanup_moa():
     if miz_moa_system:
          print("Shutting down MoA system...")
          await miz_moa_system.cleanup()
          print("MoA system shut down.")

# --- Execute in Notebook ---
# Use asyncio.run() to start the main async function
# Note: If running in a Jupyter notebook, ensure nest_asyncio.apply() was called.
# If you encounter issues, you might need to run this in a separate Python script
# or manage the event loop more carefully in the notebook.

# To run:
# asyncio.run(initialize_and_run_moa())

# To cleanup later:
# asyncio.run(cleanup_moa())

In [44]:
# Cell 16: Human-Agent Collaboration Interface (API/Spec - AW Pillar) - Enhanced for MoA
# Status: Defines API structure. Backend logic uses async calls to MoA system/components with error handling. Pause/resume integration uses MoA system methods. Serialization/permission needs highlighted.
# OKI Requirements: Implement backend logic connecting to components (MoA System, HDE, AGG, CV, XAI). Handle permissions and serialization.
# Reasoning: This version updates API backend logic to interact with the async MoA system (Cell 15) for task management (pause/resume, status, assignment) and other async components (HDE, AGG, CV, XAI). It uses await for these interactions and includes appropriate error handling.

import logging
import datetime
import json
import uuid # Added
import asyncio # Added
from typing import Dict, Any, Optional, List, Union
import matplotlib.pyplot as plt
import time

# --- MoA/Orchestrator Dependency ---
# Import the new MoA system components from Cell 15
try:
    from cell15 import MIZ_MoA_System # Import the main system class
except ImportError:
    logging.warning("Could not import MoA system from Cell 15. Using placeholders.")
    class MIZ_MoA_System: # Placeholder
        async def get_task_status(self, task_id): return {"status": "unknown"}
        async def pause_task(self, task_id, reason): return False
        async def resume_task(self, task_id, approver_id): return False
        async def process_external_task(self, task_details, task_id_override=None): return None
        async def get_history(self, *args, **kwargs): return [] # Placeholder method

# --- Other Dependencies ---
# Assume these are available or use placeholders
# from cell5 import HybridDecisionEngine, AutonomousGoalGenerator, ContinuousValidation # Async versions
# from cell11 import ExplainableAI # Async version? Assume sync for now, wrap calls
# from cell1 import CONFIG # Global config

# --- Placeholder Dependencies ---
class PlaceholderHDE:
    async def get_history(self, *args, **kwargs): return []
    async def update_decision_log(self, *args, **kwargs): return True # Placeholder
class PlaceholderAGG:
    async def get_active_goals(self, *args, **kwargs): return []
    async def add_goal(self, *args, **kwargs): return f"goal_{uuid.uuid4()}"
class PlaceholderCV:
    async def submit_feedback(self, *args, **kwargs): return True # Placeholder
class PlaceholderXAI:
    def explain_decision(self, *args, **kwargs): return "Placeholder Explanation" # Sync placeholder

logger = logging.getLogger('MIZ-OKI.HumanAgentInterface')

class HumanAgentInterfaceAPI:
    """ Defines the conceptual API endpoints for human-agent collaboration (Async & MoA Integrated). """
    def __init__(self, decision_engine: Any, goal_generator: Any, validator: Any, xai: Any, moa_system: Any): # Use MoA System
        self.decision_engine = decision_engine
        self.goal_generator = goal_generator
        self.validator = validator
        self.xai = xai
        self.moa_system = moa_system # Store MoA system reference
        self.logger = logging.getLogger('MIZ-OKI.HumanAgentInterfaceAPI')
        if not all([decision_engine, goal_generator, validator, xai, moa_system]):
             self.logger.warning("One or more backend components are missing. API functionality limited.")
        self.logger.info("Human-Agent Interface API structure defined (Async & MoA Integrated).")

    # --- Decision Review & Approval (Updated for MoA Task Pause/Resume) ---
    async def get_pending_reviews(self, user_id: str, limit: int = 10) -> Dict[str, List[Dict]]:
        """ Retrieves decisions or tasks requiring human review/approval asynchronously. """
        self.logger.info(f"API Call: get_pending_reviews async for user {user_id}")
        pending = []

        # 1. Check Decision Engine History (Assume HDE methods are async or wrapped)
        if self.decision_engine and hasattr(self.decision_engine, 'get_history'):
             try:
                  history = await self.decision_engine.get_history(limit=50) # Assume async
                  review_threshold = self.decision_engine.config.get('human_review_confidence_threshold', 0.75)
                  for decision in history:
                       if not isinstance(decision, dict): continue
                       decision_id = decision.get('decision_id')
                       if not decision_id or decision.get('human_review_status'): continue
                       needs_review = (decision.get('status') == 'success' and (decision.get('final_confidence', 1.0) < review_threshold or decision.get('requires_human_approval', False)))
                       if needs_review and len(pending) < limit:
                            pending.append({"review_id": decision_id, "type": "decision_approval", "summary": f"Review decision {decision.get('decision_type', 'N/A')} (Conf: {decision.get('final_confidence', 0):.2f})", "timestamp": decision.get('timestamp_start', 'N/A'), "details_link": f"/decisions/{decision_id}"})
             except Exception as e: self.logger.error(f"Error fetching pending reviews from HDE: {e}")
        else: self.logger.warning("Decision engine unavailable/missing methods for review check.")

        # 2. Check MoA System for tasks paused for approval
        if self.moa_system and hasattr(self.moa_system, 'task_persistence') and len(pending) < limit:
            try:
                # Use persistence layer directly or add a method to MoA system
                paused_tasks = await self.moa_system.task_persistence.load_all_tasks(status_filter="paused_for_approval")
                for task in paused_tasks[-limit:]: # Get most recent paused
                    if len(pending) < limit:
                         task_id = task.get('task_id') or task.get('id')
                         if not task_id: continue
                         pending.append({"review_id": task_id, "type": "task_approval", "summary": f"Approve task '{task.get('task_details',{}).get('task_type', 'N/A')}'. Reason: {task.get('pause_reason', 'Approval')}", "timestamp": task.get('submitted_at', 'N/A'), "details_link": f"/tasks/{task_id}"})
            except Exception as e: self.logger.error(f"Error fetching paused tasks from MoA persistence: {e}")
        else: self.logger.warning("MoA system/persistence unavailable for paused task check.")

        self.logger.info(f"Found {len(pending)} pending reviews.")
        return {"pending_reviews": pending}

    async def approve_action(self, user_id: str, review_id: str, comments: Optional[str] = None) -> Dict[str, Any]:
        """ Approves a pending task or decision asynchronously. """
        self.logger.info(f"API Call: approve_action async for review {review_id} by user {user_id}")
        # MIZ 3.0 TODO: Add permission check

        # Try resuming MoA task first
        if self.moa_system and hasattr(self.moa_system, 'resume_task'):
            try:
                resumed = await self.moa_system.resume_task(review_id, approver_id=user_id)
                if resumed:
                    # MIZ 3.0 TODO: Add comments to task history/KG if needed
                    return {"status": "approved", "review_id": review_id, "message": "Task approved and resumed."}
                else: self.logger.debug(f"Task {review_id} not resumed via MoA, checking decision engine.")
            except Exception as e: self.logger.error(f"Error resuming task via MoA for {review_id}: {e}")

        # Check decision log (assuming sync update for now, wrap if needed)
        if self.decision_engine and hasattr(self.decision_engine, 'update_decision_log'):
            try:
                update_payload = {'human_review_status': 'approved', 'human_reviewer': user_id, 'human_review_comments': comments, 'human_review_timestamp': datetime.datetime.now().isoformat()}
                success = await asyncio.to_thread(self.decision_engine.update_decision_log, review_id, update_payload) # Wrap sync call
                if success:
                    self.logger.info(f"Decision {review_id} marked as approved (placeholder log update).")
                    # MIZ 3.0 TODO: Trigger subsequent action based on approved decision
                    return {"status": "approved", "review_id": review_id, "message": "Decision approved (placeholder)."}
            except Exception as e: self.logger.error(f"Error updating decision log for approval {review_id}: {e}")

        self.logger.error(f"Review item {review_id} not found or could not be approved.")
        return {"status": "error", "review_id": review_id, "message": "Approval failed."}

    async def reject_action(self, user_id: str, review_id: str, reason: str, feedback_data: Optional[Dict] = None) -> Dict[str, Any]:
        """ Rejects a pending task or decision asynchronously. """
        self.logger.info(f"API Call: reject_action async for review {review_id} by user {user_id}. Reason: {reason}")
        # MIZ 3.0 TODO: Add permission check

        rejected = False; component_id_for_feedback = None

        # Try pausing/updating MoA task status first (use pause with rejection reason)
        if self.moa_system and hasattr(self.moa_system, 'pause_task'):
            try:
                # Use pause to effectively reject/halt the task
                paused_as_rejected = await self.moa_system.pause_task(review_id, reason=f"Rejected by {user_id}: {reason}")
                if paused_as_rejected:
                    # Additionally update status in persistence if pause doesn't set 'rejected'
                    task_record = await self.moa_system.task_persistence.load_task(review_id)
                    if task_record:
                         task_record['status'] = 'rejected' # Explicitly mark as rejected
                         task_record['rejection_reason'] = reason
                         task_record['rejected_by'] = user_id
                         await self.moa_system.task_persistence.save_task(task_record)
                    self.logger.info(f"Task {review_id} marked as rejected via MoA system.")
                    component_id_for_feedback = task_record.get("target_agent_id", "unknown_agent") if task_record else "unknown_agent"
                    rejected = True
            except Exception as e: self.logger.error(f"Error rejecting task via MoA for {review_id}: {e}")

        # If not rejected via MoA, check decision engine
        if not rejected and self.decision_engine and hasattr(self.decision_engine, 'update_decision_log'):
            try:
                update_payload = {'human_review_status': 'rejected', 'human_reviewer': user_id, 'human_rejection_reason': reason, 'human_review_timestamp': datetime.datetime.now().isoformat()}
                success = await asyncio.to_thread(self.decision_engine.update_decision_log, review_id, update_payload) # Wrap sync call
                if success:
                    self.logger.info(f"Decision {review_id} marked as rejected (placeholder log update).")
                    component_id_for_feedback = "HybridDecisionEngine" # Placeholder
                    rejected = True
            except Exception as e: self.logger.error(f"Error updating decision log for rejection {review_id}: {e}")

        if not rejected: return {"status": "error", "review_id": review_id, "message": "Rejection failed."}

        # Submit feedback async if provided
        if feedback_data and self.validator and hasattr(self.validator, 'submit_feedback'):
            validation_task_id = "general_system_feedback" # Placeholder mapping
            feedback_data_with_context = {"rejection_reason": reason, "rejected_item_id": review_id, "reviewer": user_id, **(feedback_data or {})}
            try:
                # Assume submit_feedback is async or wrap it
                success = await self.validator.submit_feedback(validation_task_id, feedback_data_with_context, source=f"human_rejection:{user_id}")
                self.logger.info(f"Async feedback submission for rejected item {review_id} {'succeeded' if success else 'failed'}.")
            except Exception as e: self.logger.error(f"Error submitting async feedback for rejected item {review_id}: {e}")

        return {"status": "rejected", "review_id": review_id, "message": "Action rejected."}

    # --- Feedback Submission (Async) ---
    async def submit_general_feedback(self, user_id: str, component_id: str, feedback_data: Dict) -> Dict[str, Any]:
        """ Submits general feedback asynchronously. """
        self.logger.info(f"API Call: submit_general_feedback async by {user_id} for {component_id}")
        if not self.validator or not hasattr(self.validator, 'submit_feedback'): return {"status": "failed", "message": "Validation system unavailable."}
        validation_task_id = "general_system_feedback" # Placeholder mapping
        try:
            feedback_data_with_context = {"user_id": user_id, "component_id": component_id, **(feedback_data or {})}
            # Assume submit_feedback is async or wrap it
            success = await self.validator.submit_feedback(validation_task_id, feedback_data_with_context, source=f"human_general:{user_id}")
            feedback_id = feedback_data_with_context.get("feedback_id", "N/A")
            return {"status": "submitted" if success else "failed", "feedback_id": feedback_id}
        except Exception as e: self.logger.error(f"Error submitting async general feedback: {e}"); return {"status": "failed", "message": f"Error: {e}"}

    # --- Goal Management (Async) ---
    async def get_active_goals(self, user_id: str, domain: Optional[str] = None) -> Dict[str, List[Dict]]:
        """ Retrieves active goals asynchronously. """
        self.logger.info(f"API Call: get_active_goals async for user {user_id}, domain {domain}")
        if not self.goal_generator or not hasattr(self.goal_generator, 'get_active_goals'): return {"active_goals": [], "message": "Goal generator unavailable."}
        try:
            # Assume get_active_goals is sync, wrap it
            goals = await asyncio.to_thread(self.goal_generator.get_active_goals, owner_agent=None)
            if domain: goals = [g for g in goals if g.get("domain") == domain]
            serializable_goals = json.loads(json.dumps(goals, default=str))
            return {"active_goals": serializable_goals}
        except Exception as e: self.logger.error(f"Error fetching active goals async: {e}"); return {"active_goals": [], "message": "Error fetching goals."}

    async def add_manual_goal(self, user_id: str, description: str, kpis: List[str], owner_agent: str, priority: float = 0.5, target_values: Optional[Dict] = None) -> Dict[str, Any]:
        """ Allows a human user to add a new goal asynchronously. """
        self.logger.info(f"API Call: add_manual_goal async by {user_id}: {description}")
        if not self.goal_generator or not hasattr(self.goal_generator, 'add_goal'): return {"status": "failed", "message": "Goal generator unavailable."}
        try:
            # Assume add_goal is async (it triggers messages)
            goal_id = await self.goal_generator.add_goal(description, kpis, owner_agent, priority, target_values, source=f"human:{user_id}")
            if goal_id: return {"status": "created", "goal_id": goal_id}
            else: return {"status": "failed", "message": "Failed to add goal."}
        except Exception as e: self.logger.error(f"Error adding manual goal async: {e}"); return {"status": "failed", "message": f"Error: {e}"}

    # --- Explainability Access (Async Wrapper) ---
    async def get_decision_explanation(self, user_id: str, decision_id: str, method: str = "chain_of_thought") -> Dict[str, Any]:
        """ Retrieves an explanation asynchronously. """
        self.logger.info(f"API Call: get_decision_explanation async for {decision_id} by {user_id}, method {method}")
        if not self.xai or not hasattr(self.xai, 'explain_decision'): return {"status": "failed", "message": "XAI system unavailable."}
        try:
            # Wrap sync XAI call
            explanation = await asyncio.to_thread(self.xai.explain_decision, decision_id, method=method)
            if explanation is None or explanation.startswith("Explanation failed"): return {"status": "not_found", "message": explanation or f"Explanation not found/failed."}
            # Handle plot serialization (remains TODO)
            if isinstance(explanation, plt.Figure): plt.close(explanation); return {"explanation_type": "plot", "message": "Plot generated (API needs serialization)."}
            else:
                 try: json.dumps(explanation, default=str); return {"explanation": explanation}
                 except TypeError: return {"status": "failed", "message": "Explanation not serializable."}
        except Exception as e: self.logger.error(f"Error getting explanation async for {decision_id}: {e}"); return {"status": "failed", "message": f"Error: {e}"}

    # --- Task Assignment & Collaboration (Uses MoA System) ---
    async def assign_task_to_agent(self, user_id: str, agent_id: str, task_type: str, task_data: Dict, context: Optional[Dict] = None) -> Dict[str, Any]:
        """ Assigns a task asynchronously via the MoA system. """
        self.logger.info(f"API Call: assign_task_to_agent async {agent_id} by {user_id} for task {task_type}")
        if not self.moa_system or not hasattr(self.moa_system, 'process_external_task'): return {"status": "failed", "message": "MoA system unavailable."}
        try:
            # MIZ 3.0: Add validation if agent_id/task_type is valid before submitting
            # agent_status = self.moa_system.get_agent_status(agent_id) # Sync check ok
            # if not agent_status: return {"status": "failed", "message": f"Agent '{agent_id}' not found."}
            # if task_type not in agent_status.get('capabilities', []): return {"status": "failed", "message": f"Agent '{agent_id}' cannot perform '{task_type}'."}

            # Package task for MoA system's external intake
            external_task_details = {
                "task_type": task_type, # The specific task
                "target_agent_hint": agent_id, # Hint for BossAgent routing
                "task_data": task_data,
                "context": context or {},
                "trigger_source": f"human:{user_id}"
            }
            task_id = await self.moa_system.process_external_task(external_task_details)
            if task_id: return {"status": "submitted", "task_id": task_id} # Submitted to BossAgent
            else: return {"status": "failed", "message": "Failed to submit task to MoA system."}
        except Exception as e: self.logger.error(f"Error assigning task async to agent {agent_id}: {e}"); return {"status": "failed", "message": f"Error: {e}"}

    async def get_task_details(self, user_id: str, task_id: str) -> Dict[str, Any]:
        """ Retrieves task details asynchronously from the MoA system. """
        self.logger.info(f"API Call: get_task_details async for {task_id} by {user_id}")
        if not self.moa_system or not hasattr(self.moa_system, 'get_task_status'): return {"status": "failed", "message": "MoA system unavailable."}
        try:
            task_details = await self.moa_system.get_task_status(task_id)
            if task_details:
                serializable_details = json.loads(json.dumps(task_details, default=str))
                return {"task": serializable_details}
            else: return {"status": "not_found", "message": "Task not found."}
        except Exception as e: self.logger.error(f"Error getting task details async for {task_id}: {e}"); return {"status": "failed", "message": f"Error: {e}"}

# --- Initialization ---
# Assume components are available from previous cells
_decision_engine = hybrid_decision_engine if 'hybrid_decision_engine' in locals() else PlaceholderHDE()
_goal_generator = autonomous_goal_generator if 'autonomous_goal_generator' in locals() else PlaceholderAGG()
_validator = continuous_validation if 'continuous_validation' in locals() else PlaceholderCV()
_xai = xai if 'xai' in locals() else PlaceholderXAI()
_moa_system = miz_moa_system if 'miz_moa_system' in locals() else None # Get from Cell 15

human_agent_interface_api = None
if _moa_system:
    try:
        human_agent_interface_api = HumanAgentInterfaceAPI(
            decision_engine=_decision_engine, goal_generator=_goal_generator,
            validator=_validator, xai=_xai, moa_system=_moa_system # Pass MoA system
        )
        print("--- MIZ 3.0 Human-Agent Interface API Structure Initialized (Async & MoA Integrated) ---")
        # ... (rest of print statements) ...
    except Exception as api_init_e:
         print(f"Error initializing HumanAgentInterfaceAPI: {api_init_e}")
         logger.error(f"HumanAgentInterfaceAPI init failed: {api_init_e}", exc_info=True)
else:
    print(f"Error: MoA System (_moa_system) not available. Cannot initialize Human-Agent Interface API structure.")
    logger.error("MoA System not available for HumanAgentInterfaceAPI.")

# Example Async API Call Simulations
# async def simulate_api_calls():
#     if human_agent_interface_api:
#         print("\nSimulating Async API calls...")
#         pending = await human_agent_interface_api.get_pending_reviews("user_sarah")
#         print(f"Pending Reviews: {json.dumps(pending, indent=2)}")
#         # ... (rest of simulation calls using await) ...
#
# # To run: asyncio.run(simulate_api_calls())

ERROR:MIZ-OKI.HumanAgentInterface:MoA System not available for HumanAgentInterfaceAPI.


Error: MoA System (_moa_system) not available. Cannot initialize Human-Agent Interface API structure.


In [45]:
# Cell 17: MLOps & Training Pipelines (Vertex AI Integration) - Enhanced
# Status: Defines pipeline structure using KFP/Vertex AI v1 components. Components refined with better error handling, scaler logic, and metric logging. MoE update step remains placeholder with detailed comments. Triggering/LLaMA 4 notes added.
# OKI Requirements: Robust pipeline components. Integration with MoE registry update (placeholder implemented). Handling of LLaMA 4 fine-tuning/distillation (notes added). Triggering via LI/CV (notes added).
# Reasoning: This version refines the KFP components for better robustness (error handling in data loading, scaler application, metric logging). It clarifies the MoE update challenge and provides implementation strategy suggestions in comments within the placeholder `update_moe_manager_op`. Notes are added to highlight the need for separate LLaMA 4 pipelines and integration with system triggers (LI/CV). It maintains the use of v1 components for deployment as previously specified.

import kfp
from kfp import dsl
# from kfp.v2 import compiler # Use v2 compiler if using Vertex AI Pipelines v2
from kfp import compiler # Use v1 compiler for broader compatibility initially
# from google_cloud_pipeline_components import aiplatform as gcc_aip # v1 components
from google_cloud_pipeline_components.v1.endpoint import EndpointCreateOp, ModelDeployOp
from google_cloud_pipeline_components.v1.model import ModelUploadOp
from google.cloud import aiplatform
import datetime
import os
import json
import logging
import numpy as np # Added for dtypes check
import pandas as pd # Added for dtypes check

logger = logging.getLogger('MIZ-OKI.MLOps')

# --- Configuration ---
# Assume PROJECT_ID, REGION, BUCKET_NAME are from Cell 1
# Ensure these variables are actually available in the scope
try:
    if 'PROJECT_ID' not in locals() or 'REGION' not in locals() or 'BUCKET_NAME' not in locals():
        raise NameError("PROJECT_ID, REGION, or BUCKET_NAME not defined. Load from Cell 1 config.")
    PIPELINE_ROOT = f"gs://{BUCKET_NAME}/miz3_pipelines"
    TIMESTAMP = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    logger.info(f"MLOps Pipeline Root: {PIPELINE_ROOT}")
except NameError as ne:
    logger.error(f"MLOps Configuration Error: {ne}. Cannot define pipeline.")
    # Set dummy values to allow script parsing, but pipeline will fail
    PROJECT_ID, REGION, BUCKET_NAME = "dummy-project", "dummy-region", "dummy-bucket"
    PIPELINE_ROOT = "gs://dummy-bucket/miz3_pipelines"
    TIMESTAMP = datetime.datetime.now().strftime("%Y%m%d%H%M%S")


# --- Pipeline Components (Enhanced Placeholders) ---

@kfp.dsl.component(
    base_image="python:3.9", # Specify a base image
    packages_to_install=["google-cloud-storage", "google-cloud-bigquery", "pandas", "scikit-learn", "numpy", "db-dtypes", "joblib"] # Added BQ, db-dtypes, joblib
)
def prepare_data_op(
    # Input parameters
    project_id: str,
    bucket_name: str, # Needed if using DataIngestionPipeline logic (or direct GCS access)
    data_source_type: str, # e.g., 'gcs', 'bq'
    source_uri_or_query: str, # GCS path(s) or BQ query
    target_column: str,
    # Outputs
    output_train_uri: dsl.OutputPath(dsl.Dataset), # Use OutputPath for clarity
    output_test_uri: dsl.OutputPath(dsl.Dataset),
    output_scaler_uri: dsl.OutputPath(dsl.Artifact), # Output scaler for consistency
    # Config
    test_split_ratio: float = 0.2,
    preprocessing_config_json: str = '{}' # Placeholder for more complex config
):
    """Pipeline component to load, preprocess, and split data."""
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler # Add other scalers/encoders as needed
    from google.cloud import storage, bigquery
    import logging
    import os
    import joblib # For saving scaler
    import numpy as np # Import numpy
    import json # For preprocessing config

    # Setup logging within the component
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger('PrepareDataOp')
    logger.info(f"Starting data preparation. Source type: {data_source_type}")
    logger.info(f"Source: {source_uri_or_query}")

    df = None
    try:
        if data_source_type == 'gcs':
            logger.info(f"Loading data from GCS: {source_uri_or_query}")
            # Basic CSV loading, MIZ 3.0 TODO: Integrate robust GCS reading (like DataIngestionPipeline)
            try:
                 # Handle potential gs:// prefix if not present
                 if not source_uri_or_query.startswith("gs://"):
                      source_uri_or_query = f"gs://{bucket_name}/{source_uri_or_query.lstrip('/')}"
                 df = pd.read_csv(source_uri_or_query)
            except FileNotFoundError:
                 logger.error(f"GCS file not found: {source_uri_or_query}")
                 raise
            except Exception as read_e:
                 logger.error(f"Error reading GCS file {source_uri_or_query}: {read_e}")
                 raise
        elif data_source_type == 'bq':
            logger.info(f"Loading data from BigQuery...")
            bq_client = bigquery.Client(project=project_id)
            try:
                 df = bq_client.query(source_uri_or_query).to_dataframe()
            except Exception as bq_e:
                 logger.error(f"Error executing BigQuery query: {bq_e}")
                 raise
        else:
             raise ValueError(f"Unsupported data_source_type: {data_source_type}")

        if df is None or df.empty:
             raise ValueError("Loaded DataFrame is empty.")

        logger.info(f"Loaded data shape: {df.shape}")
        if target_column not in df.columns:
             raise ValueError(f"Target column '{target_column}' not found in data columns: {df.columns.tolist()}")

        # --- Preprocessing ---
        try:
             prep_config = json.loads(preprocessing_config_json)
        except json.JSONDecodeError:
             logger.warning("Invalid preprocessing_config_json. Using default preprocessing.")
             prep_config = {}

        # Handle missing values (simple mean imputation for numeric)
        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
        numeric_cols.remove(target_column) if target_column in numeric_cols else None
        for col in numeric_cols:
             if df[col].isnull().any():
                  mean_val = df[col].mean()
                  df[col].fillna(mean_val, inplace=True)
                  logger.info(f"Imputed missing values in numeric '{col}' with mean ({mean_val:.2f}).")
        # MIZ 3.0 TODO: Add imputation for categorical based on prep_config

        # Handle categorical encoding (simple one-hot for MVP)
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
        if categorical_cols:
             logger.info(f"Applying OneHotEncoding to: {categorical_cols}")
             try:
                  df = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dummy_na=False) # Avoid NA columns
                  logger.info(f"Data shape after encoding: {df.shape}")
             except Exception as encode_e:
                  logger.error(f"OneHotEncoding failed: {encode_e}. Check categorical columns.")
                  raise

        # Scaling (only numeric features)
        features = [col for col in df.columns if col != target_column]
        # Re-select numeric features *after* potential encoding
        numeric_features_for_scaling = df[features].select_dtypes(include=np.number).columns.tolist()

        scaler_applied = False
        if not numeric_features_for_scaling:
             logger.warning("No numeric features found for scaling after preprocessing.")
             # Ensure scaler output path is handled gracefully
             os.makedirs(os.path.dirname(output_scaler_uri), exist_ok=True)
             with open(output_scaler_uri, 'w') as f: f.write('{}') # Write empty JSON as placeholder
             logger.info(f"Created empty scaler artifact at {output_scaler_uri}")
        else:
             logger.info(f"Applying StandardScaler to {len(numeric_features_for_scaling)} features...")
             scaler = StandardScaler()
             # Use try-except for transform robustness
             try:
                  df[numeric_features_for_scaling] = scaler.fit_transform(df[numeric_features_for_scaling])
                  logger.info(f"StandardScaler applied.")
                  # Save the scaler
                  os.makedirs(os.path.dirname(output_scaler_uri), exist_ok=True)
                  joblib.dump(scaler, output_scaler_uri)
                  logger.info(f"Scaler saved to {output_scaler_uri}")
                  scaler_applied = True
             except Exception as scale_e:
                  logger.error(f"StandardScaler failed: {scale_e}. Proceeding without scaling.")
                  # Create empty scaler artifact
                  os.makedirs(os.path.dirname(output_scaler_uri), exist_ok=True)
                  with open(output_scaler_uri, 'w') as f: f.write('{}')
                  logger.info(f"Created empty scaler artifact at {output_scaler_uri} due to scaling error.")


        # Split data
        y = df[target_column]
        X = df[features]
        # Stratify for classification tasks if possible
        stratify_col = None
        if pd.api.types.is_categorical_dtype(y) or pd.api.types.is_integer_dtype(y):
             if y.nunique() > 1 and y.nunique() < len(y) // 2: # Heuristic for stratification suitability
                  stratify_col = y
                  logger.info("Attempting stratified split.")

        try:
             X_train, X_test, y_train, y_test = train_test_split(
                 X, y, test_size=test_split_ratio, random_state=42, stratify=stratify_col
             )
             if stratify_col is not None: logger.info("Stratified split performed.")
             else: logger.info("Performed random split.")
        except ValueError as split_e: # Stratify fails if only one class represented etc.
             logger.warning(f"Could not stratify split ({split_e}). Performing random split.")
             X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split_ratio, random_state=42)

        train_df = pd.concat([X_train, y_train], axis=1)
        test_df = pd.concat([X_test, y_test], axis=1)
        logger.info(f"Split data: Train shape {train_df.shape}, Test shape {test_df.shape}")

        # Save outputs
        os.makedirs(os.path.dirname(output_train_uri), exist_ok=True)
        train_df.to_csv(output_train_uri, index=False)
        logger.info(f"Saved training data to {output_train_uri}")

        os.makedirs(os.path.dirname(output_test_uri), exist_ok=True)
        test_df.to_csv(output_test_uri, index=False)
        logger.info(f"Saved test data to {output_test_uri}")

    except Exception as e:
         logger.error(f"Data preparation failed: {e}", exc_info=True)
         # Ensure output paths are created even on failure? KFP might handle this.
         # Creating empty files might prevent downstream failures but hide the root cause.
         # It's generally better to let the step fail clearly.
         raise

@kfp.dsl.component(
    base_image="tensorflow/tensorflow:2.9.0", # Or match desired TF version
    packages_to_install=["pandas", "joblib", "numpy"] # Added joblib, numpy
)
def train_expert_model_op(
    # Inputs
    train_data: dsl.Input[dsl.Dataset],
    input_scaler_uri: dsl.Input[dsl.Artifact], # Optional scaler input
    target_column: str,
    # Outputs
    model_dir: dsl.Output[dsl.Model], # Output type for Vertex AI Model registry
    # Config
    model_id_prefix: str = "miz3-expert",
    model_version: str = "v1",
    task_type: str = "classification", # 'classification' or 'regression'
    # input_shape_json: str = '[]', # JSON string e.g., '[10]' - Shape derived from data now
    output_shape_json: str = '[1]', # JSON string e.g., '[1]'
    hyperparameters_json: str = '{}', # JSON string for hyperparameters
    epochs: int = 10,
    batch_size: int = 32
):
    """Pipeline component to train an expert model (conceptual MiniModel structure)."""
    import tensorflow as tf
    from tensorflow.keras import layers, models, optimizers
    import pandas as pd
    import json
    import logging
    import os
    import joblib # For loading scaler
    import numpy as np # Import numpy

    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger('TrainExpertModelOp')
    model_id = f"{model_id_prefix}-{model_version}"
    logger.info(f"Starting model training for {model_id}")

    try:
        # Load data
        train_df = pd.read_csv(train_data.path)
        logger.info(f"Loaded training data: {train_df.shape}")

        if target_column not in train_df.columns:
             raise ValueError(f"Target column '{target_column}' not found.")

        features = [col for col in train_df.columns if col != target_column]
        if not features:
             raise ValueError("No feature columns found after excluding target column.")

        X_train_df = train_df[features] # Keep as DataFrame for scaler
        y_train = train_df[target_column]

        # Load scaler if provided and apply
        scaler = None
        # Check if the scaler artifact path exists and is not empty/dummy
        scaler_path = input_scaler_uri.path
        if os.path.exists(scaler_path) and os.path.getsize(scaler_path) > 2: # Check size > empty {}
            try:
                scaler = joblib.load(scaler_path)
                logger.info(f"Loaded scaler from {scaler_path}")
                numeric_features = X_train_df.select_dtypes(include=np.number).columns.tolist()
                if numeric_features:
                     # Ensure scaler has the same features it was trained on (optional but good practice)
                     # if hasattr(scaler, 'feature_names_in_') and set(numeric_features) != set(scaler.feature_names_in_):
                     #     logger.warning("Scaler features mismatch training data features. Applying anyway.")
                     X_train_df[numeric_features] = scaler.transform(X_train_df[numeric_features])
                     logger.info("Applied loaded scaler to training data.")
                else:
                     logger.warning("Scaler loaded but no numeric features found in training data to apply it to.")
            except FileNotFoundError:
                 logger.warning(f"Scaler file not found at {scaler_path}, though artifact exists. Proceeding without scaling.")
            except Exception as scaler_e:
                 logger.error(f"Failed to load or apply scaler: {scaler_e}. Proceeding without scaling.")
                 scaler = None # Ensure scaler is None if loading failed
        else:
             logger.info("Scaler artifact not found, empty, or not provided. Proceeding without scaling.")

        # Convert features to numpy array *after* potential scaling
        X_train = X_train_df.values
        # Convert target to numpy array
        y_train_values = y_train.values


        # Parse shapes and hyperparameters
        # Derive input shape from data
        input_shape = (X_train.shape[1],)
        logger.info(f"Derived input shape from data: {input_shape}")

        try:
             output_shape = tuple(json.loads(output_shape_json))
        except:
             logger.warning("Invalid output_shape_json. Using default [1].")
             output_shape = (1,)

        try:
             hyperparams = json.loads(hyperparameters_json)
        except json.JSONDecodeError:
             logger.warning("Invalid hyperparameters_json. Using default {}.")
             hyperparams = {}


        # MIZ 3.0: Replicate MiniModel build logic here or load from a shared script/package
        # Build a simple model based on MiniModel concept
        model = models.Sequential(name=model_id)
        model.add(layers.Input(shape=input_shape)) # Use derived input_shape
        hidden_layers = hyperparams.get("hidden_layers", [64, 32])
        activation = hyperparams.get("activation", "relu")
        dropout_rate = hyperparams.get("dropout_rate", 0.0) # Add dropout support

        for units in hidden_layers:
            model.add(layers.Dense(units, activation=activation))
            if dropout_rate > 0:
                 model.add(layers.Dropout(dropout_rate))

        # Adjust output layer based on task type
        output_units = output_shape[0]
        if task_type == "classification":
             final_activation = 'sigmoid' if output_units == 1 else 'softmax'
             loss = 'binary_crossentropy' if output_units == 1 else 'categorical_crossentropy'
             metrics = ['accuracy']
             # Convert labels to categorical for multiclass
             if output_units > 1:
                  num_classes = y_train.nunique() # Infer num_classes from target
                  if num_classes != output_units:
                       logger.warning(f"Inferred {num_classes} classes from target, but output_shape specified {output_units}. Using inferred value.")
                       output_units = num_classes
                  logger.info(f"Converting labels to categorical (num_classes={output_units})")
                  y_train_values = tf.keras.utils.to_categorical(y_train_values, num_classes=output_units)
             elif len(y_train_values.shape) == 1: # Ensure binary target is (batch, 1)
                  y_train_values = np.expand_dims(y_train_values, axis=-1)
        elif task_type == "regression":
             final_activation = 'linear'
             loss = 'mse'
             metrics = ['mae']
             if len(y_train_values.shape) == 1: # Ensure regression target is (batch, output_units)
                  if output_units > 1 and len(y_train_values.shape) == 1:
                       # This case is ambiguous - assuming single output regression if shape is 1D
                       logger.warning("Regression task with 1D target but output_units > 1. Assuming single output.")
                       output_units = 1
                  y_train_values = np.expand_dims(y_train_values, axis=-1)
        else:
             raise ValueError(f"Unsupported task_type: {task_type}")


        model.add(layers.Dense(output_units, activation=final_activation))
        logger.info("Model architecture built.")
        model.summary(print_fn=logger.info)

        # Compile model
        learning_rate = hyperparams.get("learning_rate", 0.001)
        model.compile(optimizer=optimizers.Adam(learning_rate=learning_rate),
                      loss=loss,
                      metrics=metrics)
        logger.info("Model compiled.")

        # Train model
        logger.info(f"Starting training for {epochs} epochs...")
        # Use validation_split for simplicity, or pass separate validation data if prepared
        history = model.fit(X_train, y_train_values, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0)
        logger.info("Training complete.")
        final_loss = history.history['loss'][-1]
        final_val_loss = history.history.get('val_loss', [None])[-1] # Use .get for safety
        logger.info(f"Final training loss: {final_loss:.4f}, Final val_loss: {final_val_loss:.4f}")

        # Save model in TF SavedModel format
        save_path = model_dir.path # Use the path provided by KFP
        model.save(save_path, save_format='tf')
        logger.info(f"Model saved to {save_path}")

        # Add metadata for Vertex AI Model registry
        model_dir.metadata["framework"] = "tensorflow"
        model_dir.metadata["task_type"] = task_type
        model_dir.metadata["model_version"] = model_version
        # Ensure hyperparameters are serializable (should be if loaded from JSON)
        model_dir.metadata["hyperparameters"] = hyperparams
        model_dir.metadata["final_loss"] = float(final_loss) # Ensure float
        if final_val_loss is not None:
             model_dir.metadata["final_val_loss"] = float(final_val_loss)
        # Add primary metric for easy access
        primary_metric = metrics[0] # e.g., 'accuracy' or 'mae'
        final_val_metric = history.history.get(f'val_{primary_metric}', [None])[-1]
        if final_val_metric is not None:
             model_dir.metadata[f"final_val_{primary_metric}"] = float(final_val_metric)

    except Exception as e:
         logger.error(f"Model training failed: {e}", exc_info=True)
         raise

@kfp.dsl.component(
    base_image="tensorflow/tensorflow:2.9.0",
    packages_to_install=["pandas", "scikit-learn", "joblib", "numpy"] # Added joblib, numpy
)
def evaluate_model_op(
    # Inputs
    test_data: dsl.Input[dsl.Dataset],
    model: dsl.Input[dsl.Model],
    input_scaler_uri: dsl.Input[dsl.Artifact], # Optional scaler input
    target_column: str,
    # Outputs
    metrics_output_path: dsl.OutputPath("metrics.json"), # Output metrics as JSON file
    kfp_metrics: dsl.Output[dsl.Metrics] # Output for KFP UI
    # MIZ 3.0 TODO: Add inputs for fairness checks (e.g., sensitive features column name)
):
    """Pipeline component to evaluate the trained model and save metrics."""
    import tensorflow as tf
    import pandas as pd
    from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, mean_squared_error, r2_score, f1_score
    import logging
    import json
    import numpy as np # Import numpy
    import joblib # For loading scaler
    import os # Import os

    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger('EvaluateModelOp')
    logger.info(f"Starting model evaluation using model from {model.uri}") # Use URI

    eval_metrics = {} # Dictionary to store metrics
    try:
        # Load model
        loaded_model = tf.keras.models.load_model(model.path) # Use path to load
        logger.info("Model loaded.")

        # Load test data
        test_df = pd.read_csv(test_data.path)
        logger.info(f"Loaded test data: {test_df.shape}")

        if target_column not in test_df.columns:
             raise ValueError(f"Target column '{target_column}' not found.")

        features = [col for col in test_df.columns if col != target_column]
        if not features:
             raise ValueError("No feature columns found after excluding target column.")

        X_test_df = test_df[features] # Keep as DataFrame for scaler
        y_test = test_df[target_column].values

        # Load scaler if provided and apply
        scaler = None
        scaler_path = input_scaler_uri.path
        if os.path.exists(scaler_path) and os.path.getsize(scaler_path) > 2: # Check size > empty {}
            try:
                scaler = joblib.load(scaler_path)
                logger.info(f"Loaded scaler from {scaler_path}")
                numeric_features = X_test_df.select_dtypes(include=np.number).columns.tolist()
                if numeric_features:
                     X_test_df[numeric_features] = scaler.transform(X_test_df[numeric_features])
                     logger.info("Applied loaded scaler to test data.")
                else:
                     logger.warning("Scaler loaded but no numeric features found in test data to apply it to.")
            except FileNotFoundError:
                 logger.warning(f"Scaler file not found at {scaler_path}, though artifact exists. Proceeding without scaling.")
            except Exception as scaler_e:
                 logger.error(f"Failed to load or apply scaler: {scaler_e}. Proceeding without scaling.")
                 scaler = None
        else:
             logger.info("Scaler artifact not found, empty, or not provided. Proceeding without scaling.")

        X_test_values = X_test_df.values # Convert to numpy array after potential scaling


        # Make predictions
        y_pred_raw = loaded_model.predict(X_test_values, verbose=0)

        # Evaluate based on task type (inferred from model metadata or loss)
        task_type = model.metadata.get("task_type", "unknown")
        if task_type == "unknown":
             try:
                  loss_name = loaded_model.loss if isinstance(loaded_model.loss, str) else loaded_model.loss.__class__.__name__
                  is_classification = "crossentropy" in loss_name.lower()
                  task_type = "classification" if is_classification else "regression"
                  logger.warning(f"Task type not in metadata, inferred as '{task_type}' from loss '{loss_name}'.")
             except Exception as loss_infer_e:
                  logger.error(f"Could not infer task type from model loss: {loss_infer_e}. Cannot evaluate.")
                  raise ValueError("Could not determine task type for evaluation.") from loss_infer_e


        if task_type == "classification":
            output_units = loaded_model.output_shape[-1]
            if output_units == 1: # Binary
                y_pred_proba = y_pred_raw.flatten()
                y_pred_label = (y_pred_proba > 0.5).astype(int)
                eval_metrics['accuracy'] = accuracy_score(y_test, y_pred_label)
                eval_metrics['precision'] = precision_score(y_test, y_pred_label, zero_division=0)
                eval_metrics['recall'] = recall_score(y_test, y_pred_label, zero_division=0)
                eval_metrics['f1_score'] = f1_score(y_test, y_pred_label, zero_division=0)
                try:
                    # Ensure y_test has multiple classes for AUC
                    if len(np.unique(y_test)) > 1:
                         eval_metrics['roc_auc'] = roc_auc_score(y_test, y_pred_proba)
                    else:
                         logger.warning("Only one class present in y_test. ROC AUC score is not defined.")
                         eval_metrics['roc_auc'] = 0.0 # Or np.nan? KFP prefers float.
                except ValueError as auc_e:
                     logger.warning(f"Could not calculate ROC AUC: {auc_e}")
                     eval_metrics['roc_auc'] = 0.0
            else: # Multiclass
                y_pred_label = np.argmax(y_pred_raw, axis=1)
                # Ensure y_test is also integer labels for multiclass metrics
                y_test_labels = y_test
                if len(y_test.shape) > 1 and y_test.shape[1] > 1: # If y_test is one-hot encoded
                     y_test_labels = np.argmax(y_test, axis=1)

                eval_metrics['accuracy'] = accuracy_score(y_test_labels, y_pred_label)
                # Add weighted metrics for multiclass
                eval_metrics['precision_weighted'] = precision_score(y_test_labels, y_pred_label, average='weighted', zero_division=0)
                eval_metrics['recall_weighted'] = recall_score(y_test_labels, y_pred_label, average='weighted', zero_division=0)
                eval_metrics['f1_score_weighted'] = f1_score(y_test_labels, y_pred_label, average='weighted', zero_division=0)
                # Note: ROC AUC is more complex for multiclass
        elif task_type == "regression":
            y_pred_flat = y_pred_raw.flatten()
            y_test_flat = y_test.flatten()
            eval_metrics['mse'] = mean_squared_error(y_test_flat, y_pred_flat)
            eval_metrics['mae'] = np.mean(np.abs(y_test_flat - y_pred_flat))
            eval_metrics['r2_score'] = r2_score(y_test_flat, y_pred_flat)
        else:
             logger.error(f"Cannot evaluate: Unsupported task_type '{task_type}'")
             raise ValueError(f"Unsupported task_type: {task_type}")


        logger.info(f"Evaluation Metrics: {eval_metrics}")

        # Log metrics to KFP UI output
        for name, value in eval_metrics.items():
            # Ensure value is float for KFP
            try:
                 kfp_metrics.log_metric(name, float(value))
            except (ValueError, TypeError) as metric_log_e:
                 logger.warning(f"Could not log metric '{name}' to KFP UI (Value: {value}): {metric_log_e}")


        # Save metrics to JSON file for downstream tasks (like deployment check)
        # Ensure all values are serializable
        serializable_metrics = {}
        for k, v in eval_metrics.items():
             try:
                  serializable_metrics[k] = float(v)
             except (ValueError, TypeError):
                  logger.warning(f"Could not serialize metric '{k}' (Value: {v}) for JSON output. Skipping.")
                  serializable_metrics[k] = str(v) # Store as string as fallback

        with open(metrics_output_path, 'w') as f:
            json.dump(serializable_metrics, f, indent=2)
        logger.info(f"Metrics saved to {metrics_output_path}")

        # MIZ 3.0 TODO: Add fairness/bias evaluation here
        # fairness_metrics = calculate_fairness(test_df, y_pred_label, sensitive_features_column)
        # for name, value in fairness_metrics.items():
        #     kfp_metrics.log_metric(f"fairness_{name}", float(value))

    except Exception as e:
         logger.error(f"Model evaluation failed: {e}", exc_info=True)
         # Write empty metrics file on failure? Or let step fail? Let it fail.
         raise

# MIZ 3.0: Placeholder component to update the MoE Manager registry
@kfp.dsl.component(
     base_image="python:3.9",
     packages_to_install=["google-cloud-aiplatform", "google-cloud-storage", "requests"] # Add potential libs needed to interact with MoE state
)
def update_moe_manager_op(
     project: str,
     location: str,
     expert_id: str, # Unique ID for the expert (e.g., model display name)
     model_resource_name: str, # Output from ModelUploadOp (projects/.../models/...)
     endpoint_resource_name: str, # Output from EndpointCreateOp or existing (projects/.../endpoints/...)
     task_type: str, # From training metadata
     domain: str, # From pipeline parameters
     metrics_json: dsl.Input[dsl.Artifact], # Metrics from evaluation
     # Add other metadata as needed (e.g., GCS path to model card)
     moe_registry_location: str = f"gs://{BUCKET_NAME}/miz3_moe_registry/registry.json" # Example GCS location for shared state
):
    """
    Placeholder component to update the MoEManager registry.
    MIZ 3.0: This needs a robust implementation strategy.
    Option 1 (Shared State - GCS/DB): This component writes/updates an entry in a GCS file or DB table
              that the MoEManager service reads periodically or on startup.
    Option 2 (API Call): This component calls a dedicated API endpoint exposed by the MoEManager service.
    This placeholder simulates Option 1 (writing to GCS).
    """
    import logging
    import json
    import os
    from google.cloud import storage
    from google.cloud import aiplatform

    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger('UpdateMoEManagerOp')
    logger.info(f"Attempting to update MoE Manager registry for expert {expert_id}.")
    logger.info(f"  Model Resource: {model_resource_name}")
    logger.info(f"  Endpoint Resource: {endpoint_resource_name}")
    logger.info(f"  Task Type: {task_type}, Domain: {domain}")
    logger.info(f"  Metrics Path: {metrics_json.path}")
    logger.info(f"  Registry Location (Target): {moe_registry_location}")

    try:
        # 1. Load evaluation metrics
        eval_metrics = {}
        try:
            with open(metrics_json.path, 'r') as f:
                eval_metrics = json.load(f)
            logger.info(f"Loaded evaluation metrics: {eval_metrics}")
        except Exception as metrics_e:
            logger.error(f"Failed to load metrics from {metrics_json.path}: {metrics_e}")
            # Continue without metrics or fail? Let's continue with a warning.

        # 2. Get Endpoint details (prediction endpoint URI)
        prediction_endpoint_uri = None
        try:
            aiplatform.init(project=project, location=location)
            endpoint = aiplatform.Endpoint(endpoint_resource_name)
            # Find the deployed model ID (might be needed for direct prediction calls)
            deployed_model_id = None
            for deployed_model in endpoint.list_models():
                 # Match based on model resource name (requires parsing)
                 if model_resource_name.split('/')[-1] in deployed_model.model:
                      deployed_model_id = deployed_model.id
                      break
            # Construct the prediction URI (format depends on region and API version)
            # Example format: https://{location}-aiplatform.googleapis.com/v1/{endpoint_resource_name}:predict
            api_endpoint = f"{location}-aiplatform.googleapis.com"
            prediction_endpoint_uri = f"https://{api_endpoint}/v1/{endpoint_resource_name}:predict"
            logger.info(f"Derived Prediction Endpoint URI: {prediction_endpoint_uri}")
            logger.info(f"Deployed Model ID on Endpoint: {deployed_model_id or 'Not Found'}")

        except Exception as endpoint_e:
             logger.error(f"Failed to get endpoint details for {endpoint_resource_name}: {endpoint_e}")
             # Proceed without endpoint URI? Or fail? Let's proceed with warning.

        # 3. Prepare registry entry
        registry_entry = {
            "expert_id": expert_id,
            "model_resource_name": model_resource_name,
            "endpoint_resource_name": endpoint_resource_name,
            "prediction_endpoint_uri": prediction_endpoint_uri, # URI for making predictions
            "deployed_model_id": deployed_model_id, # ID on the endpoint
            "task_type": task_type,
            "domain": domain,
            "status": "active", # Mark as active upon successful deployment
            "last_updated": datetime.datetime.now().isoformat(),
            "evaluation_metrics": eval_metrics,
            # Add other relevant metadata (e.g., training pipeline run ID)
            "pipeline_run_id": os.environ.get('KFP_RUN_ID', 'unknown')
        }

        # 4. Update shared registry (Simulating GCS update)
        logger.info(f"Simulating update to MoE registry at {moe_registry_location}")
        try:
            storage_client = storage.Client(project=project)
            bucket_name = moe_registry_location.split('/')[2]
            blob_name = '/'.join(moe_registry_location.split('/')[3:])
            bucket = storage_client.bucket(bucket_name)
            blob = bucket.blob(blob_name)

            registry_data = {}
            if blob.exists():
                try:
                    registry_data = json.loads(blob.download_as_text())
                    if not isinstance(registry_data, dict):
                         logger.warning("Existing registry is not a dictionary. Overwriting.")
                         registry_data = {}
                except json.JSONDecodeError:
                    logger.warning(f"Could not decode existing registry at {moe_registry_location}. Overwriting.")
                    registry_data = {}
                except Exception as download_e:
                     logger.error(f"Failed to download existing registry: {download_e}. Attempting to overwrite.")
                     registry_data = {}


            # Add or update the entry
            registry_data[expert_id] = registry_entry
            logger.info(f"Updating registry with entry for {expert_id}")

            # Upload updated registry
            blob.upload_from_string(json.dumps(registry_data, indent=2), content_type='application/json')
            logger.info(f"Successfully updated MoE registry at {moe_registry_location}")

        except Exception as gcs_e:
            logger.error(f"Failed to update MoE registry via GCS: {gcs_e}")
            # This step failing might leave the MoE manager unaware of the new model.
            # Consider adding alerting or alternative notification here.
            raise # Re-raise to potentially fail the pipeline step

    except Exception as e:
         logger.error(f"Update MoE Manager operation failed: {e}", exc_info=True)
         raise


# --- Define the Training Pipeline (Using v1 Components for Deployment) ---
@kfp.dsl.pipeline(
    name="miz3-expert-training-pipeline-v1deploy",
    description="Pipeline for training, evaluating, and deploying MIZ 3.0 expert models using v1 deploy components.",
    pipeline_root=PIPELINE_ROOT,
)
def expert_training_pipeline_v1deploy(
    # Project Info
    project: str = PROJECT_ID,
    location: str = REGION, # Use 'location' for v1 components
    # Data Inputs
    data_source_type: str = 'gcs',
    source_uri_or_query: str = f"gs://{BUCKET_NAME}/data/training/dummy_data.csv", # Example input
    target_column: str = 'target',
    # Model Config
    model_display_name_prefix: str = "miz3-expert",
    task_type: str = "classification",
    # input_shape_json: str = '[]', # Shape derived in component now
    output_shape_json: str = '[1]',
    hyperparameters_json: str = '{"learning_rate": 0.001, "hidden_layers": [64, 32]}',
    epochs: int = 20,
    batch_size: int = 32,
    # Deployment Config
    serving_image: str = "us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2.9:latest", # Match TF version used in training
    deployment_threshold_metric: str = "accuracy", # Metric from evaluate_model_op output
    deployment_threshold_value: float = 0.80,
    endpoint_display_name_prefix: str = "miz3-expert-endpoint", # Reusable endpoint prefix
    deploy_machine_type: str = "n1-standard-4",
    deploy_traffic_split_json: str = '{"0": 100}', # JSON string for traffic split
    # MIZ 3.0 Config
    expert_domain: str = "default_domain" # Domain for MoE registration
):
    # Generate unique names for this run
    # Use pipeline job ID for uniqueness if available, else timestamp
    # run_id = dsl.PIPELINE_JOB_ID_PLACEHOLDER # Preferred for uniqueness
    # Using timestamp as a simpler alternative for notebook execution
    run_id = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

    model_display_name = f"{model_display_name_prefix}-{run_id}"
    # Create a potentially reusable endpoint name based on the prefix
    endpoint_display_name = f"{endpoint_display_name_prefix}-{model_display_name_prefix}"

    # 1. Prepare Data
    prepare_data_task = prepare_data_op(
        project_id=project,
        bucket_name=BUCKET_NAME, # Pass bucket name
        data_source_type=data_source_type,
        source_uri_or_query=source_uri_or_query,
        target_column=target_column
    )
    # Pass scaler output artifact
    scaler_output = prepare_data_task.outputs["output_scaler_uri"]

    # 2. Train Model
    train_model_task = train_expert_model_op(
        train_data=prepare_data_task.outputs["output_train_uri"],
        input_scaler_uri=scaler_output, # Pass scaler artifact
        target_column=target_column,
        model_id_prefix=model_display_name_prefix, # Pass prefix
        model_version=run_id, # Use run ID as version
        task_type=task_type,
        # input_shape_json=input_shape_json, # Shape derived in component now
        output_shape_json=output_shape_json,
        hyperparameters_json=hyperparameters_json,
        epochs=epochs,
        batch_size=batch_size
    )

    # 3. Evaluate Model
    evaluate_model_task = evaluate_model_op(
        test_data=prepare_data_task.outputs["output_test_uri"],
        model=train_model_task.outputs["model_dir"], # Pass the Model artifact
        input_scaler_uri=scaler_output, # Pass scaler artifact
        target_column=target_column
    )

    # 4. Conditional Deployment
    # Access metric using .outputs['kfp_metrics'].outputs[metric_name]
    # Need to handle potential missing metric if evaluation fails or metric name is wrong
    # Using a placeholder condition for now, as direct metric access can be tricky in KFP SDK
    # A more robust way involves outputting metrics to a file and using another component to read and compare.
    # For simplicity, we use the direct output access, assuming it works in the target KFP version.
    with dsl.Condition(
        evaluate_model_task.outputs['kfp_metrics'].outputs[deployment_threshold_metric] >= deployment_threshold_value,
        name="deploy-condition"
    ):
        # 4a. Upload Model to Vertex AI Model Registry
        model_upload_op = ModelUploadOp(
            project=project,
            location=location,
            display_name=model_display_name,
            artifact_uri=train_model_task.outputs["model_dir"].uri, # Pass the URI of the saved model directory
            serving_container_image_uri=serving_image,
            labels={"miz_pipeline_run_id": run_id, "miz_expert_domain": expert_domain}
        ).after(evaluate_model_task) # Ensure evaluation completes first

        # 4b. Create or Get Endpoint
        endpoint_create_op = EndpointCreateOp(
            project=project,
            location=location,
            display_name=endpoint_display_name,
            labels={"miz_app": "bgi_platform", "miz_domain": expert_domain}
        )

        # 4c. Deploy Model to Endpoint
        model_deploy_op = ModelDeployOp(
            project=project,
            endpoint=endpoint_create_op.outputs["endpoint"],
            model=model_upload_op.outputs["model"],
            deployed_model_display_name=model_display_name,
            machine_type=deploy_machine_type,
            traffic_split=json.loads(deploy_traffic_split_json), # Pass traffic split as dict
        ).after(model_upload_op) # Ensure model upload completes first

        # 4d. Update MoE Manager Registry (Placeholder)
        update_moe_task = update_moe_manager_op(
            project=project,
            location=location,
            expert_id=model_display_name, # Use the unique name as ID
            model_resource_name=model_upload_op.outputs["model"].resource_name, # Pass model resource name
            endpoint_resource_name=endpoint_create_op.outputs["endpoint"].resource_name, # Pass endpoint resource name
            task_type=task_type,
            domain=expert_domain,
            metrics_json=evaluate_model_task.outputs["metrics_output_path"] # Pass metrics file artifact
        ).after(model_deploy_op) # Run after deployment


# --- Compile and Run Pipeline ---
pipeline_filename_v1deploy = "miz3_expert_training_pipeline_v1deploy.json"

# MIZ 3.0 Note: Separate pipelines would be needed for LLaMA 4 fine-tuning or distillation.
# MIZ 3.0 Note: Pipeline triggering should be integrated with LearningIntegration/ContinuousValidation components.

try:
    # Compile the pipeline using v1 compiler for v1 components
    compiler.Compiler(mode=kfp.dsl.PipelineExecutionMode.V1_LEGACY).compile(
        pipeline_func=expert_training_pipeline_v1deploy,
        package_path=pipeline_filename_v1deploy,
    )
    logger.info(f"Pipeline compiled successfully to {pipeline_filename_v1deploy}")

    # --- Optional: Run the pipeline ---
    run_pipeline_v1 = False # Set to True to automatically run

    if run_pipeline_v1:
        logger.info("Submitting pipeline job to Vertex AI...")
        # Ensure aiplatform is initialized (from Cell 1)
        vertex_initialized = False
        try:
             if hasattr(aiplatform, 'initializer') and aiplatform.initializer.global_config.project:
                  vertex_initialized = True
                  logger.info("Vertex AI SDK already initialized.")
             else:
                  aiplatform.init(project=PROJECT_ID, location=REGION)
                  vertex_initialized = True
                  logger.info("Vertex AI SDK initialized.")
        except Exception as init_e:
             logger.error(f"Vertex AI initialization failed: {init_e}. Cannot submit pipeline.")
             # raise RuntimeError("Vertex AI not initialized") from init_e # Optional: Stop execution

        if vertex_initialized:
            # MIZ 3.0 TODO: Ensure dummy data exists at the specified GCS path
            dummy_data_gcs_path = f"gs://{BUCKET_NAME}/data/training/dummy_data.csv"
            logger.warning(f"Pipeline run assumes dummy data exists at: {dummy_data_gcs_path}")
            # MIZ 3.0 TODO: Create dummy data if it doesn't exist
            # try:
            #     storage_client = storage.Client(project=PROJECT_ID)
            #     bucket = storage_client.bucket(BUCKET_NAME)
            #     blob = bucket.blob("data/training/dummy_data.csv")
            #     if not blob.exists():
            #          logger.info(f"Creating dummy data at {dummy_data_gcs_path}")
            #          dummy_df = pd.DataFrame(np.random.rand(100, 11), columns=[f'f{i}' for i in range(10)] + ['target'])
            #          dummy_df['target'] = (dummy_df['target'] > 0.5).astype(int)
            #          blob.upload_from_string(dummy_df.to_csv(index=False), content_type='text/csv')
            # except Exception as dummy_data_e:
            #      logger.error(f"Failed to create/check dummy data: {dummy_data_e}")


            job = aiplatform.PipelineJob(
                display_name=f"miz3-expert-train-v1deploy-{TIMESTAMP}",
                template_path=pipeline_filename_v1deploy,
                pipeline_root=PIPELINE_ROOT,
                parameter_values={ # Example parameters matching the v1deploy pipeline
                    'project': PROJECT_ID,
                    'location': REGION,
                    'data_source_type': 'gcs', # Or 'bq'
                    'source_uri_or_query': dummy_data_gcs_path, # Ensure this exists
                    'target_column': 'target', # Ensure this matches dummy data
                    'model_display_name_prefix': f"miz3-churn-expert", # Prefix for model name
                    'task_type': 'classification',
                    'output_shape_json': '[1]', # Binary classification
                    'hyperparameters_json': '{"learning_rate": 0.001, "hidden_layers": [16, 8]}', # Smaller model for test
                    'epochs': 5, # Fewer epochs for test run
                    'batch_size': 16,
                    'deployment_threshold_metric': 'accuracy',
                    'deployment_threshold_value': 0.60, # Lower threshold for dummy data/quick test
                    'endpoint_display_name_prefix': 'miz3-churn-endpoint', # Reusable endpoint prefix
                    'expert_domain': 'customer_retention' # Domain for MoE
                },
                enable_caching=True, # Enable caching for faster re-runs
            )
            try:
                job.submit()
                logger.info(f"Pipeline job submitted. View in Cloud Console: {job._dashboard_uri()}")
                # job.wait() # Uncomment to wait for completion
                # logger.info("Pipeline job finished.")
            except Exception as job_submit_e:
                 logger.error(f"Failed to submit pipeline job: {job_submit_e}")

    print("\n--- MIZ 3.0 MLOps Pipeline (v1 Deploy) Defined & Compiled ---")
    print(f"Pipeline definition saved to: {pipeline_filename_v1deploy}")
    if run_pipeline_v1 and vertex_initialized:
        print(f"Pipeline job submission attempted to Vertex AI.")
    elif run_pipeline_v1:
        print(f"Pipeline run skipped due to Vertex AI initialization failure.")
    else:
        print("Set 'run_pipeline_v1 = True' and ensure data exists at the specified GCS path to run the pipeline on Vertex AI.")
    print("-------------------------------------------------------------")

except NameError as ne:
     print(f"Error during MLOps setup: {ne}. Please ensure 'kfp', 'google-cloud-pipeline-components', and 'google-cloud-aiplatform' are installed and imported.")
     logger.error(f"MLOps setup failed due to missing libraries or variables: {ne}")
except Exception as e:
    print(f"An error occurred during MLOps pipeline definition or compilation: {e}")
    logger.error("MLOps pipeline definition/compilation failed.", exc_info=True)

ModuleNotFoundError: No module named 'kfp'

In [46]:
# Cell 18: Foundation Model Integration (NN Pillar) - Enhanced
# Status: Phase 1 Fixes Applied. OpenAI/Anthropic implementations added. Retry logic active. Cost tracking basic. LLaMA 4 integration via Vertex placeholders. Long context/multimodal stubs added. JSON parsing refined.
# OKI Requirements: Robust handling of LLaMA 4 Scout/Maverick via Vertex AI. Implementation of methods leveraging long context and multimodality (stubs added). Accurate cost tracking based on LLaMA 4 pricing (placeholder pricing used).
# Reasoning: This version maintains the multi-provider structure. It ensures the Vertex AI call logic can handle LLaMA 4 model IDs (using Llama 3 IDs as placeholders from config). Stubs for long_context and multimodal methods are added with NotImplementedError, clearly marking them as future work but defining the intended interface. JSON parsing in extract_entities/relationships is made more robust. Cost tracking uses placeholder pricing from config, with comments highlighting the need for accurate LLaMA 4 pricing updates.

import os
import requests
import json
import time
import logging
import numpy as np
import pandas as pd # Added for type hints, though not directly used here
import uuid
from typing import Dict, List, Union, Any, Optional
from collections import deque
from google.cloud import aiplatform
from google.cloud import exceptions as gcp_exceptions
# Phase 1: Add OpenAI and Anthropic libraries
try:
    import openai
    from openai import RateLimitError as OpenAIRateLimitError
    from openai import APIError as OpenAIAPIError
    from openai import AuthenticationError as OpenAIAuthError
    OPENAI_AVAILABLE = True
except ImportError:
    print("Warning: 'openai' library not found. Install it (`pip install openai`) for OpenAI integration.")
    OPENAI_AVAILABLE = False
    # Define dummy exceptions if library missing
    class OpenAIRateLimitError(Exception): pass
    class OpenAIAPIError(Exception): pass
    class OpenAIAuthError(Exception): pass

try:
    import anthropic
    from anthropic import RateLimitError as AnthropicRateLimitError
    from anthropic import APIError as AnthropicAPIError
    from anthropic import AuthenticationError as AnthropicAuthError
    ANTHROPIC_AVAILABLE = True
except ImportError:
    print("Warning: 'anthropic' library not found. Install it (`pip install anthropic`) for Anthropic integration.")
    ANTHROPIC_AVAILABLE = False
    # Define dummy exceptions if library missing
    class AnthropicRateLimitError(Exception): pass
    class AnthropicAPIError(Exception): pass
    class AnthropicAuthError(Exception): pass


# Import tenacity for retry logic
try:
    from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type, RetryError, stop_after_delay
except ImportError:
    print("Warning: 'tenacity' library not found. Install it (`pip install tenacity`) for robust retry logic.")
    # Define dummy retry decorator and exception
    def retry(**kwargs):
        def decorator(func): return func
        return decorator
    class RetryError(Exception): pass
    # Define basic retryable exceptions if tenacity is missing
    RETRYABLE_EXCEPTIONS = (requests.exceptions.ConnectionError, requests.exceptions.Timeout)
else:
    # Define common retryable exceptions for API calls
    RETRYABLE_EXCEPTIONS = (
        requests.exceptions.ConnectionError, requests.exceptions.Timeout, # General network
        gcp_exceptions.ServiceUnavailable, gcp_exceptions.InternalServerError, gcp_exceptions.TooManyRequests, # GCP
        OpenAIRateLimitError, OpenAIAPIError, # OpenAI (retry on general APIError too, might be transient)
        AnthropicRateLimitError, AnthropicAPIError, # Anthropic (retry on general APIError too)
        # Add other potential transient errors if identified (e.g., specific HTTP status codes like 502, 503, 504)
    )

# Ensure logger is configured
logger = logging.getLogger('MIZ-OKI.FoundationModels')

class FoundationModelClient:
    """
    Manages interactions with external foundation models (OpenAI, Anthropic, Vertex AI - including LLaMA 4).
    Phase 1: Implemented OpenAI/Anthropic calls, retry logic, basic cost tracking. LLaMA 4 via Vertex.
    """
    def __init__(self, config: Dict, default_provider: str = "vertex"):
        self.config = config
        self.api_keys = self._load_api_keys()

        # Initialize clients if keys are available
        self.openai_client = None
        if OPENAI_AVAILABLE and "openai" in self.api_keys:
            try:
                self.openai_client = openai.OpenAI(api_key=self.api_keys["openai"])
                self.logger.info("OpenAI client initialized.")
            except Exception as e:
                self.logger.error(f"Failed to initialize OpenAI client: {e}")
                self.api_keys.pop("openai", None) # Mark as unavailable if init fails

        self.anthropic_client = None
        if ANTHROPIC_AVAILABLE and "anthropic" in self.api_keys:
            try:
                self.anthropic_client = anthropic.Anthropic(api_key=self.api_keys["anthropic"])
                self.logger.info("Anthropic client initialized.")
            except Exception as e:
                self.logger.error(f"Failed to initialize Anthropic client: {e}")
                self.api_keys.pop("anthropic", None) # Mark as unavailable

        # Vertex AI Client (SDK initialization check)
        self.vertex_ai_available = False
        try:
             # More robust check for Vertex AI initialization status
             if 'aiplatform' in globals() and hasattr(aiplatform, 'initializer') and aiplatform.initializer.global_config.project:
                  self.vertex_ai_available = True
                  self.api_keys["vertex"] = "gcp_authenticated" # Mark as available
                  self.logger.info("Vertex AI provider available (authenticated via SDK).")
             else:
                  self.logger.warning("Vertex AI SDK not initialized. Vertex provider unavailable.")
        except Exception as e:
             self.logger.warning(f"Error checking Vertex AI initialization: {e}. Vertex provider likely unavailable.")


        # Select default provider based on availability
        resolved_default_provider = default_provider
        # Check if default is available, otherwise find first available fallback
        if default_provider not in self.api_keys:
            fallback_order = ["vertex", "openai", "anthropic"] # Prioritize Vertex
            for provider in fallback_order:
                 if provider in self.api_keys:
                      resolved_default_provider = provider
                      break
            else: # No providers available
                 resolved_default_provider = None

            if resolved_default_provider and resolved_default_provider != default_provider:
                 logger.warning(f"Configured default provider '{default_provider}' not available, falling back to '{resolved_default_provider}'.")
            elif not resolved_default_provider:
                 logger.error("CRITICAL: No foundation model providers configured or available!")
        self.default_provider = resolved_default_provider

        # Load model defaults and pricing from config
        self.default_models = self.config.get("foundation_model_defaults", {})
        self.pricing_data = self.config.get("foundation_model_pricing", {})
        # OKI TODO: Update pricing_data with accurate LLaMA 4 pricing when available.

        self.usage_stats = {provider: {"prompt_tokens": 0, "completion_tokens": 0, "cost": 0.0, "calls": 0, "errors": 0}
                            for provider in ["openai", "anthropic", "vertex"]}
        self.call_history = deque(maxlen=1000) # Store recent call metadata
        self.logger = logging.getLogger('MIZ-OKI.FoundationModels')
        self.logger.info(f"Foundation Model Client initialized. Default: {self.default_provider}. Available: {list(self.api_keys.keys())}")

    def _load_api_keys(self) -> Dict[str, str]:
        """Load API keys from environment variables or config."""
        api_keys = {}
        fm_keys_config = self.config.get("foundation_model_keys", {})

        # OpenAI
        openai_key = os.environ.get("OPENAI_API_KEY") or fm_keys_config.get("openai")
        if openai_key and OPENAI_AVAILABLE:
            api_keys["openai"] = openai_key
            self.logger.info("OpenAI API key loaded.")
        elif not OPENAI_AVAILABLE:
             self.logger.debug("OpenAI library not installed.")
        else:
             self.logger.debug("OpenAI API key not found.")

        # Anthropic
        anthropic_key = os.environ.get("ANTHROPIC_API_KEY") or fm_keys_config.get("anthropic")
        if anthropic_key and ANTHROPIC_AVAILABLE:
            api_keys["anthropic"] = anthropic_key
            self.logger.info("Anthropic API key loaded.")
        elif not ANTHROPIC_AVAILABLE:
             self.logger.debug("Anthropic library not installed.")
        else:
             self.logger.debug("Anthropic API key not found.")

        # Vertex AI availability is checked during __init__
        # No explicit key needed here if SDK is initialized

        if not api_keys and not self.vertex_ai_available:
             self.logger.warning("No API keys found or providers configured successfully.")
        return api_keys

    def _get_model_for_request(self, provider: Optional[str], model_alias: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
        """Resolves the provider and model ID to use for a request."""
        resolved_provider = provider or self.default_provider
        if not resolved_provider or resolved_provider not in self.api_keys:
            available = list(self.api_keys.keys())
            self.logger.error(f"Provider '{resolved_provider}' not available or not configured. Available: {available}")
            return None, None

        # Use model_alias if provided, otherwise use provider default
        model_id = None
        if model_alias:
             # Look up alias in defaults (e.g., "llama4_scout" -> "llama3-8b-instruct")
             model_id = self.default_models.get(model_alias)
             if not model_id:
                  # If alias not found, assume it's a direct model ID
                  model_id = model_alias
                  self.logger.debug(f"Model alias '{model_alias}' not in defaults, treating as direct model ID.")
        else:
             # Get default model for the resolved provider
             model_id = self.default_models.get(resolved_provider)

        if not model_id:
             self.logger.error(f"No model ID resolved for provider '{resolved_provider}' (Alias: {model_alias}).")
             return resolved_provider, None

        return resolved_provider, model_id


    # Apply retry logic using tenacity
    @retry(
        wait=wait_exponential(multiplier=1, min=2, max=30), # Exponential backoff: 2s, 4s, 8s, ... up to 30s
        stop=(stop_after_attempt(4) | stop_after_delay(60)), # Stop after 4 attempts OR 60 seconds
        retry=retry_if_exception_type(RETRYABLE_EXCEPTIONS),
        reraise=True # Re-raise the exception after retries are exhausted
    )
    def generate_text(self, prompt: str,
                     provider: Optional[str] = None,
                     model_alias: Optional[str] = None, # Use alias (e.g., 'llama4_scout') or direct model ID
                     temperature: float = 0.7,
                     max_tokens: int = 1024,
                     system_prompt: Optional[str] = None) -> Optional[str]:
        """Generate text using a foundation model with retry logic."""
        resolved_provider, model_id = self._get_model_for_request(provider, model_alias)
        if not resolved_provider or not model_id:
             # Error already logged in _get_model_for_request
             self.usage_stats.setdefault(resolved_provider or "unknown", {})["errors"] = self.usage_stats.get(resolved_provider or "unknown", {}).get("errors", 0) + 1
             return None

        call_id = f"fm_call_{uuid.uuid4()}"
        call_start = time.time()
        log_entry = {
            "call_id": call_id, "timestamp": call_start, "provider": resolved_provider,
            "model": model_id, "type": "text_generation",
            "prompt_preview": prompt[:100] + "..." if len(prompt) > 100 else prompt,
            "status": "started"
        }
        # Use thread-safe append if client is used across threads, deque is generally safe for single producer/consumer
        self.call_history.append(log_entry)

        response_text = None
        response_metadata = {}

        try:
            self.logger.info(f"Calling {resolved_provider}:{model_id} for text generation (Call ID: {call_id})...")
            if resolved_provider == "openai":
                if not self.openai_client: raise RuntimeError("OpenAI client not initialized.")
                response_text, response_metadata = self._call_openai(prompt, model_id, temperature, max_tokens, system_prompt)
            elif resolved_provider == "anthropic":
                if not self.anthropic_client: raise RuntimeError("Anthropic client not initialized.")
                response_text, response_metadata = self._call_anthropic(prompt, model_id, temperature, max_tokens, system_prompt)
            elif resolved_provider == "vertex":
                if not self.vertex_ai_available: raise RuntimeError("Vertex AI SDK not initialized.")
                response_text, response_metadata = self._call_vertex(prompt, model_id, temperature, max_tokens, system_prompt)
            else:
                # This case should not be reached if _get_model_for_request works correctly
                raise ValueError(f"Unsupported provider: {resolved_provider}")

            call_duration = time.time() - call_start
            log_entry.update({
                "status": "success", "duration": call_duration,
                "response_preview": response_text[:100] + "..." if response_text and len(response_text) > 100 else response_text,
                "metadata": response_metadata
            })
            self._update_usage_stats(resolved_provider, model_id, response_metadata)
            self.logger.info(f"Call {call_id} successful ({call_duration:.2f}s).")
            return response_text

        except RETRYABLE_EXCEPTIONS as retry_e:
             self.logger.warning(f"Retryable error calling {resolved_provider}:{model_id} (Call ID: {call_id}): {retry_e}. Retrying via tenacity...")
             log_entry.update({"status": "retrying", "error": str(retry_e)})
             raise # Reraise to trigger tenacity retry
        except (OpenAIAuthError, AnthropicAuthError, gcp_exceptions.PermissionDenied) as auth_e:
             call_duration = time.time() - call_start
             self.logger.error(f"Authentication error calling {resolved_provider}:{model_id} (Call ID: {call_id}): {auth_e}")
             log_entry.update({"status": "error", "error": f"Authentication Error: {auth_e}", "duration": call_duration})
             self.usage_stats[resolved_provider]["errors"] += 1
             return None # Don't retry auth errors
        except RetryError as final_retry_e: # Catch tenacity's final RetryError
             call_duration = time.time() - call_start
             self.logger.error(f"API call failed after multiple retries for {resolved_provider}:{model_id} (Call ID: {call_id}): {final_retry_e}")
             log_entry.update({"status": "error", "error": f"Failed after retries: {final_retry_e}", "duration": call_duration})
             self.usage_stats[resolved_provider]["errors"] += 1
             return None
        except Exception as e:
            call_duration = time.time() - call_start
            self.logger.error(f"Non-retryable error calling {resolved_provider}:{model_id} (Call ID: {call_id}) after {call_duration:.2f}s: {e}", exc_info=True)
            log_entry.update({"status": "error", "error": str(e), "duration": call_duration})
            self.usage_stats[resolved_provider]["errors"] += 1
            return None

    def _call_openai(self, prompt, model, temperature, max_tokens, system_prompt):
        """Phase 1: Implement OpenAI API call."""
        messages = []
        if system_prompt: messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        response = self.openai_client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens
        )

        # Validate response structure
        if not response.choices or not response.choices[0].message or response.choices[0].message.content is None:
             finish_reason = response.choices[0].finish_reason if response.choices else "unknown"
             # Raise APIError which is retryable by default
             raise OpenAIAPIError(f"OpenAI response missing content. Finish reason: {finish_reason}")

        text = response.choices[0].message.content
        metadata = {}
        if response.usage:
             metadata = {"prompt_tokens": response.usage.prompt_tokens, "completion_tokens": response.usage.completion_tokens}
        else:
             self.logger.warning("OpenAI response missing usage data.")
             # Estimate tokens (basic)
             prompt_tokens = len(prompt.split()) # Very rough estimate
             completion_tokens = len(text.split()) # Very rough estimate
             metadata = {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, "estimated": True}

        return text, metadata

    def _call_anthropic(self, prompt, model, temperature, max_tokens, system_prompt):
        """Phase 1: Implement Anthropic API call."""
        response = self.anthropic_client.messages.create(
            model=model,
            system=system_prompt, # Pass system prompt if provided
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            max_tokens=max_tokens
        )

        # Validate response structure
        if not response.content or not response.content[0].text:
             stop_reason = response.stop_reason if response.stop_reason else "unknown"
             # Raise APIError which is retryable by default
             raise AnthropicAPIError(f"Anthropic response missing content. Stop reason: {stop_reason}")

        text = response.content[0].text
        metadata = {}
        if response.usage:
             metadata = {"prompt_tokens": response.usage.input_tokens, "completion_tokens": response.usage.output_tokens}
        else:
             self.logger.warning("Anthropic response missing usage data.")
             prompt_tokens = len(prompt.split()) # Very rough estimate
             completion_tokens = len(text.split()) # Very rough estimate
             metadata = {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, "estimated": True}

        return text, metadata

    def _call_vertex(self, prompt, model, temperature, max_tokens, system_prompt):
        """Call Vertex AI API (Gemini/PaLM/LLaMA) with specific error handling."""
        # OKI: This method should handle Gemini, PaLM, and LLaMA models available on Vertex.
        # The logic uses heuristics based on model name.
        try:
            # Heuristic to check model type (adjust if model names change)
            is_gemini = "gemini" in model.lower()
            is_llama = "llama" in model.lower() # OKI: Check for LLaMA

            if is_gemini or is_llama: # Gemini and LLaMA use the GenerativeModel interface
                 try: from vertexai.generative_models import GenerativeModel, Part, GenerationConfig, HarmCategory, HarmBlockThreshold
                 except ImportError: raise ImportError("Vertex AI GenerativeModels not available. Install 'google-cloud-aiplatform[generative_models]'")

                 # Extract base model ID if version is included (e.g., gemini-1.5-flash-001)
                 model_id_only = model.split('@')[0]
                 vertex_model = GenerativeModel(model_id_only, system_instruction=system_prompt)

                 # Define safety settings (adjust as needed)
                 safety_settings = {
                     HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
                     HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
                     HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
                     HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
                 }
                 generation_config = GenerationConfig(max_output_tokens=max_tokens, temperature=temperature)

                 # Generate content
                 response = vertex_model.generate_content(
                     [prompt],
                     generation_config=generation_config,
                     safety_settings=safety_settings
                 )

                 # Check for blocked content or empty response
                 if not response.candidates or not response.candidates[0].content.parts:
                      block_reason = "Unknown"
                      finish_reason = "Unknown"
                      if hasattr(response, 'prompt_feedback') and response.prompt_feedback:
                           block_reason = response.prompt_feedback.block_reason
                      if response.candidates and response.candidates[0].finish_reason:
                           finish_reason = response.candidates[0].finish_reason.name
                      # Raise a retryable error for safety blocks or empty responses
                      raise gcp_exceptions.InternalServerError(f"Content blocked or empty response. Block Reason: {block_reason}, Finish Reason: {finish_reason}")

                 response_text = response.text
                 usage_metadata = response.usage_metadata
                 prompt_tokens = usage_metadata.prompt_token_count if usage_metadata else 0
                 completion_tokens = usage_metadata.candidates_token_count if usage_metadata else 0
                 estimated = not usage_metadata # Flag if tokens were estimated

            else: # Assume PaLM or older TextGenerationModel interface
                 # Ensure model name has version if needed (heuristic)
                 model_name_with_version = model
                 if "@" not in model_name_with_version and any(m in model_name_with_version for m in ["text-bison", "text-unicorn"]):
                      model_name_with_version += "@latest"

                 vertex_model = aiplatform.TextGenerationModel.from_pretrained(model_name_with_version)
                 # Combine system prompt with user prompt for PaLM
                 full_prompt = f"System: {system_prompt}\n\nUser: {prompt}" if system_prompt else prompt
                 response = vertex_model.predict(full_prompt, temperature=temperature, max_output_tokens=max_tokens)
                 response_text = response.text
                 # PaLM API doesn't return token counts, estimate crudely
                 prompt_tokens = len(full_prompt.split())
                 completion_tokens = len(response_text.split())
                 estimated = True

            metadata = {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens}
            if estimated: metadata["estimated"] = True
            return response_text, metadata

        # Catch specific GCP exceptions for better error handling/retries
        except gcp_exceptions.NotFound as e:
             self.logger.error(f"Vertex AI Model '{model}' not found: {e}")
             raise # Reraise specific errors if needed, or handle non-retryable
        except gcp_exceptions.InvalidArgument as e:
             self.logger.error(f"Invalid argument calling Vertex AI model '{model}': {e}")
             raise # Often non-retryable
        except gcp_exceptions.PermissionDenied as e:
             self.logger.error(f"Permission denied calling Vertex AI model '{model}': {e}")
             raise # Non-retryable auth error
        except RuntimeError as e: # Catch safety blocks raised above
             self.logger.warning(f"Vertex AI call blocked or empty for model '{model}': {e}")
             raise gcp_exceptions.InternalServerError(str(e)) # Treat as potentially retryable server issue
        except Exception as e:
            self.logger.error(f"Unexpected error calling Vertex AI model '{model}': {e}", exc_info=True)
            # Raise a generic retryable error for unexpected issues
            raise gcp_exceptions.InternalServerError(f"Unexpected Vertex AI error: {e}") from e

    def _update_usage_stats(self, provider: str, model: str, metadata: Dict):
        """Update usage and estimated cost."""
        prompt_tokens = metadata.get("prompt_tokens", 0)
        completion_tokens = metadata.get("completion_tokens", 0)
        estimated = metadata.get("estimated", False)

        if provider not in self.usage_stats:
             self.usage_stats[provider] = {"prompt_tokens": 0, "completion_tokens": 0, "cost": 0.0, "calls": 0, "errors": 0}

        self.usage_stats[provider]["prompt_tokens"] += prompt_tokens
        self.usage_stats[provider]["completion_tokens"] += completion_tokens
        self.usage_stats[provider]["calls"] += 1

        # Cost calculation
        provider_pricing = self.pricing_data.get(provider, {})
        model_pricing = provider_pricing.get(model)

        # Fallback pricing lookup (e.g., if versioned model ID used but pricing is for base model)
        if not model_pricing:
             # Try finding pricing for a base model name (e.g., gpt-4-turbo from gpt-4-turbo-2024-04-09)
             # This heuristic might need adjustment based on actual model ID patterns
             base_model_parts = model.split('-')
             if len(base_model_parts) > 1:
                  base_model_guess = '-'.join(base_model_parts[:2]) # e.g., gpt-4, claude-3, gemini-1.5
                  model_pricing = provider_pricing.get(base_model_guess)
                  if not model_pricing and len(base_model_parts) > 2: # Try longer base name
                       base_model_guess = '-'.join(base_model_parts[:3])
                       model_pricing = provider_pricing.get(base_model_guess)


        call_cost = 0.0
        if model_pricing:
            cost_per_prompt = model_pricing.get("prompt", 0.0)
            cost_per_completion = model_pricing.get("completion", 0.0)

            # Determine cost unit (per 1k or 1M tokens) - Heuristic based on price magnitude
            # Anthropic uses $/Million, OpenAI/Vertex often use $/1k
            unit_divisor = 1_000_000.0 if cost_per_prompt >= 1.0 else 1000.0 # Assume >= $1/unit means per Million

            call_cost = (prompt_tokens * cost_per_prompt / unit_divisor) + (completion_tokens * cost_per_completion / unit_divisor)
            self.usage_stats[provider]["cost"] += call_cost
            cost_note = "(Estimated)" if estimated else ""
            self.logger.debug(f"Usage Updated - Provider: {provider}, Model: {model}, P Tokens: {prompt_tokens}, C Tokens: {completion_tokens}, Call Cost: ${call_cost:.6f} {cost_note}, Total Cost: ${self.usage_stats[provider]['cost']:.4f}")
        else:
             self.logger.warning(f"Pricing data not found for {provider}:{model}. Cost estimation unavailable for this call.")
             # Don't add to total cost if pricing is missing


    # --- Task-Specific Methods (Refined JSON Parsing) ---

    def extract_entities(self, text: str, entity_types: Optional[List[str]] = None, provider: Optional[str] = None, model_alias: Optional[str] = None) -> List[Dict]:
        """Extract entities using a structured prompt."""
        resolved_provider, model_id = self._get_model_for_request(provider, model_alias)
        if not resolved_provider or not model_id: return []

        system_prompt = """You are an expert text analysis assistant. Your task is to extract entities from the provided text.
Respond ONLY with a valid JSON list of objects. Each object must have the following keys:
- "entity": The exact text span of the extracted entity.
- "type": The type of the entity (e.g., PERSON, ORGANIZATION, LOCATION, DATE, PRODUCT, etc.).
- "start": The starting character index of the entity in the original text.
- "end": The ending character index (exclusive) of the entity in the original text."""
        if entity_types:
            system_prompt += f"\nFocus ONLY on these entity types: {', '.join(entity_types)}."
        prompt = f"Text to analyze:\n```\n{text}\n```\n\nExtract entities as JSON list:"

        response = self.generate_text(prompt=prompt, system_prompt=system_prompt, provider=resolved_provider, model_alias=model_id, temperature=0.1, max_tokens=1024) # Pass model_id as alias here
        if response is None: return []

        # Robust cleaning and parsing
        cleaned_response = response.strip()
        # Find the start and end of the main JSON list
        start_index = cleaned_response.find('[')
        end_index = cleaned_response.rfind(']')

        if start_index == -1 or end_index == -1 or end_index < start_index:
             self.logger.error(f"Entity extraction response does not contain a valid JSON list structure: '{response}'")
             return []

        json_str = cleaned_response[start_index : end_index + 1]

        try:
            entities = json.loads(json_str)
            if not isinstance(entities, list):
                 self.logger.error(f"Entity extraction result is not a JSON list after parsing: {entities}")
                 return []

            validated_entities = []
            for item in entities:
                 if isinstance(item, dict) and all(k in item for k in ["entity", "type", "start", "end"]):
                      try:
                           # Validate indices are integers and make sense
                           start = int(item['start'])
                           end = int(item['end'])
                           if 0 <= start < end <= len(text): # Basic index validation
                                item['start'] = start
                                item['end'] = end
                                validated_entities.append(item)
                           else:
                                self.logger.warning(f"Skipping entity with invalid indices relative to text length: {item}")
                      except (ValueError, TypeError):
                           self.logger.warning(f"Skipping entity with non-integer index types: {item}")
                 else:
                      self.logger.warning(f"Skipping invalid entity format (missing keys): {item}")

            self.logger.info(f"Extracted {len(validated_entities)} valid entities.")
            return validated_entities
        except json.JSONDecodeError as e:
             self.logger.error(f"Failed to parse entity JSON: {e}. String: '{json_str}'")
             return []
        except Exception as e:
             self.logger.error(f"Unexpected error processing extracted entities: {e}")
             return []


    def summarize(self, text: str, max_length: int = 150, format: str = "paragraph", provider: Optional[str] = None, model_alias: Optional[str] = None) -> Optional[str]:
        """Generate a concise summary."""
        resolved_provider, model_id = self._get_model_for_request(provider, model_alias)
        if not resolved_provider or not model_id: return None

        system_prompt = f"You are an expert summarization assistant. Summarize the following text concisely, aiming for approximately {max_length} words. Format the summary as a {format}."
        prompt = f"Text to summarize:\n```\n{text}\n```\n\nSummary:"
        # Adjust max_tokens based on desired word length (heuristic)
        max_output_tokens = int(max_length * 2.0) # Allow more tokens for generation flexibility

        summary = self.generate_text(prompt=prompt, system_prompt=system_prompt, provider=resolved_provider, model_alias=model_id, temperature=0.3, max_tokens=max_output_tokens)
        if summary:
            self.logger.info(f"Generated summary (approx {len(summary.split())} words).")
            # Optional: Post-process to strictly enforce length if needed
            # summary = " ".join(summary.split()[:max_length])
        return summary

    def extract_relationships(self, text: str, entity_pairs: Optional[List[tuple]] = None, provider: Optional[str] = None, model_alias: Optional[str] = None) -> List[Dict]:
        """Extract relationships (subject-predicate-object triples)."""
        resolved_provider, model_id = self._get_model_for_request(provider, model_alias)
        if not resolved_provider or not model_id: return []

        system_prompt = """You are an expert text analysis assistant. Your task is to extract relationships from the provided text.
Represent relationships as subject-predicate-object triples.
Respond ONLY with a valid JSON list of objects. Each object must have the following keys:
- "subject": The subject entity text.
- "predicate": The verb phrase or relationship description.
- "object": The object entity text."""
        prompt = f"Text to analyze:\n```\n{text}\n```\n\nExtract relationships as JSON list:"
        if entity_pairs:
            prompt += "\n\nFocus specifically on relationships involving these entity pairs (or variations): " + ", ".join([f"({s}, {o})" for s, o in entity_pairs])

        response = self.generate_text(prompt=prompt, system_prompt=system_prompt, provider=resolved_provider, model_alias=model_id, temperature=0.1, max_tokens=1024)
        if response is None: return []

        # Robust cleaning and parsing
        cleaned_response = response.strip()
        start_index = cleaned_response.find('[')
        end_index = cleaned_response.rfind(']')

        if start_index == -1 or end_index == -1 or end_index < start_index:
             self.logger.error(f"Relationship extraction response does not contain a valid JSON list structure: '{response}'")
             return []

        json_str = cleaned_response[start_index : end_index + 1]

        try:
            relationships = json.loads(json_str)
            if not isinstance(relationships, list):
                 self.logger.error(f"Relationship extraction result is not a JSON list after parsing: {relationships}")
                 return []

            validated_rels = []
            for item in relationships:
                 if isinstance(item, dict) and all(k in item for k in ["subject", "predicate", "object"]):
                      # Basic validation: ensure values are strings and not empty
                      if all(isinstance(item[k], str) and item[k].strip() for k in ["subject", "predicate", "object"]):
                           validated_rels.append(item)
                      else:
                           self.logger.warning(f"Skipping relationship with non-string or empty values: {item}")
                 else:
                      self.logger.warning(f"Skipping invalid relationship format (missing keys): {item}")

            self.logger.info(f"Extracted {len(validated_rels)} valid relationships.")
            return validated_rels
        except json.JSONDecodeError as e:
             self.logger.error(f"Failed to parse relationship JSON: {e}. String: '{json_str}'")
             return []
        except Exception as e:
             self.logger.error(f"Unexpected error processing extracted relationships: {e}")
             return []

    # --- MIZ 3.0 OKI Specific Methods (Stubs) ---

    def generate_text_long_context(self, prompt: str, provider: Optional[str] = None, model_alias: Optional[str] = 'llama4_maverick', **kwargs) -> Optional[str]:
        """
        Generates text using a model optimized for long context (e.g., LLaMA 4 Maverick).
        OKI TODO: Implement specific handling if chunking or special API params are needed,
                  otherwise relies on the underlying model's native long context capability.
        """
        self.logger.info(f"Generating text with long context model ({model_alias})...")
        # For now, just call generate_text, assuming the model handles the long context.
        # Add specific logic here if the API requires different handling for >N tokens.
        return self.generate_text(prompt=prompt, provider=provider, model_alias=model_alias, **kwargs)

    def describe_image(self, image_uri: Optional[str] = None, image_bytes: Optional[bytes] = None, prompt: str = "Describe this image in detail.", provider: Optional[str] = 'vertex', model_alias: Optional[str] = 'llama4_maverick', **kwargs) -> Optional[str]:
        """
        Describes an image using a multimodal model (e.g., LLaMA 4).
        OKI TODO: Implement using Vertex AI multimodal API for LLaMA 4.
        """
        self.logger.info(f"Requesting image description using {model_alias}...")
        if not image_uri and not image_bytes:
            self.logger.error("Either image_uri or image_bytes must be provided.")
            return None
        if not self.vertex_ai_available: # Assuming Vertex for LLaMA 4 multimodal initially
             self.logger.error("Vertex AI provider needed for multimodal description is unavailable.")
             return None

        # Placeholder implementation - requires Vertex AI GenerativeModel multimodal call
        self.logger.warning("describe_image method not fully implemented. Requires Vertex AI multimodal API integration.")
        # Example conceptual call structure:
        # try:
        #     from vertexai.generative_models import GenerativeModel, Part, Image
        #     model = GenerativeModel(model_alias) # Use resolved LLaMA 4 model ID
        #     image_part = Part.from_uri(image_uri, mime_type="image/jpeg") if image_uri else Part.from_data(image_bytes, mime_type="image/jpeg") # Adjust mime_type
        #     response = model.generate_content([prompt, image_part], **kwargs)
        #     return response.text
        # except Exception as e:
        #     self.logger.error(f"Multimodal image description failed: {e}")
        #     return None
        raise NotImplementedError("describe_image requires Vertex AI multimodal API implementation.")

    def analyze_video(self, video_uri: str, prompt: str = "Analyze this video.", provider: Optional[str] = 'vertex', model_alias: Optional[str] = 'llama4_maverick', **kwargs) -> Optional[str]:
        """
        Analyzes a video using a multimodal model (e.g., LLaMA 4).
        OKI TODO: Implement using Vertex AI multimodal API for LLaMA 4 video analysis.
        """
        self.logger.info(f"Requesting video analysis using {model_alias}...")
        if not self.vertex_ai_available: # Assuming Vertex for LLaMA 4 multimodal initially
             self.logger.error("Vertex AI provider needed for multimodal video analysis is unavailable.")
             return None

        # Placeholder implementation - requires Vertex AI GenerativeModel multimodal call for video
        self.logger.warning("analyze_video method not fully implemented. Requires Vertex AI multimodal API integration for video.")
        # Example conceptual call structure:
        # try:
        #     from vertexai.generative_models import GenerativeModel, Part
        #     model = GenerativeModel(model_alias) # Use resolved LLaMA 4 model ID
        #     video_part = Part.from_uri(video_uri, mime_type="video/mp4") # Adjust mime_type
        #     response = model.generate_content([prompt, video_part], **kwargs)
        #     return response.text
        # except Exception as e:
        #     self.logger.error(f"Multimodal video analysis failed: {e}")
        #     return None
        raise NotImplementedError("analyze_video requires Vertex AI multimodal API implementation for video.")

    # MIZ 3.0 TODO: Add methods for function calling, embedding generation etc.
    # def generate_embedding(...)
    # def call_function(...)

    # --- Utility Methods ---
    def get_usage_summary(self) -> Dict[str, Dict]:
        """Return summary of API usage and estimated costs."""
        # Return a deep copy to prevent external modification
        return json.loads(json.dumps(self.usage_stats))

    def get_call_history(self, limit: int = 20) -> List[Dict]:
        """Return recent call history."""
        return list(self.call_history)[-limit:]

# --- Integration Helpers (Remains the same conceptually) ---
def create_neural_processing_handlers(fm_client: Optional[FoundationModelClient]):
    """Create handlers for NeuralProcessing component integration."""
    if not fm_client:
        logger.error("FoundationModelClient required for neural processing handlers. Handlers will be non-functional.")
        # Return dummy handlers that log errors
        def error_processor(data): logger.error("FM Client unavailable for processing."); return str(data)
        def error_embedder(data): logger.error("FM Client unavailable for embedding."); return np.random.rand(384) # Return random vector
        return {"processors": {"text": error_processor}, "embedding_models": {"foundation_default": error_embedder}}

    # Define actual handlers using the fm_client
    def foundation_text_processor(text):
        # Simple text cleaning or preparation if needed
        if isinstance(text, str): return text.strip()
        elif isinstance(text, dict): return " ".join([f"{k}:{str(v)[:50]}" for k, v in text.items() if isinstance(v, (str, int, float))])
        else: return str(text)

    def foundation_embedding_func(processed_data):
        # MIZ 3.0 TODO: Implement a dedicated embedding method in fm_client
        # Placeholder: Use generate_text or a specific embedding model if available
        logger.warning("Using placeholder random embeddings in foundation_embedding_func. Implement dedicated embedding method.")
        if isinstance(processed_data, list):
            return [np.random.rand(768) for _ in processed_data] # Example dimension
        else:
            return np.random.rand(768)

    return {
        "processors": {
            "text": foundation_text_processor,
            "customer_profile_text": foundation_text_processor, # Example specific type
        },
        "embedding_models": {
            "foundation_default": foundation_embedding_func
        }
    }

# --- Initialization ---
foundation_model_client = None
if 'CONFIG' in locals():
    try:
        foundation_model_client = FoundationModelClient(CONFIG)

        # --- Integration with other components (if they exist) ---
        # Example: Register handlers with NeuralProcessing (Cell 6)
        if 'neural_processing' in locals() and neural_processing:
            handlers = create_neural_processing_handlers(foundation_model_client)
            for proc_type, proc_func in handlers["processors"].items():
                neural_processing.register_processor(proc_type, proc_func)
            for model_id, embed_func in handlers["embedding_models"].items():
                neural_processing.register_embedding_model(model_id, embed_func)
            logger.info("Foundation Model integration handlers registered with NeuralProcessing.")
        else:
             logger.info("NeuralProcessing component not found. Skipping handler registration.")

        # Example: Inject client into Research Agents (Cell 4)
        if 'autonomous_knowledge_agent' in locals() and autonomous_knowledge_agent:
            # Assuming AKA has an attribute to hold the client
            autonomous_knowledge_agent.foundation_model_client = foundation_model_client
            logger.info("Foundation Model client injected into AutonomousKnowledgeAgent.")
        else:
             logger.info("AutonomousKnowledgeAgent component not found. Skipping client injection.")

        # --- Initialization Message & Basic Tests ---
        print("--- MIZ 3.0 Foundation Model Integration Layer Initialized (Phase 1 - Multi-Provider) ---")
        available_providers = list(foundation_model_client.api_keys.keys())
        print(f"Default Provider: {foundation_model_client.default_provider}")
        print(f"Available Providers: {', '.join(available_providers) if available_providers else 'None Configured'}")
        print(f"Default Models: {foundation_model_client.default_models}")

        # Simple test (using default provider if available)
        if foundation_model_client.default_provider:
             print(f"\nRunning simple generation test using default provider ({foundation_model_client.default_provider})...")
             try:
                  # Use a default model alias if available for the provider
                  default_model_alias = None
                  if foundation_model_client.default_provider in foundation_model_client.default_models:
                       default_model_alias = foundation_model_client.default_provider # e.g., 'vertex' maps to 'gemini...'
                  elif 'llama4_scout' in foundation_model_client.default_models: # Fallback to scout
                       default_model_alias = 'llama4_scout'

                  if default_model_alias:
                       summary = foundation_model_client.generate_text(
                           "Explain Business General Intelligence (BGI) in one sentence.",
                           model_alias=default_model_alias, # Use alias
                           max_tokens=60
                       )
                       print(f"Default Provider Test Summary: {summary or 'Failed'}")
                  else:
                       print("Could not determine default model alias for testing.")

             except Exception as test_e: print(f"Default provider test failed: {test_e}")
        else:
             print("\nSkipping default provider test as none are available.")


        print(f"\nCurrent Usage Stats: {foundation_model_client.get_usage_summary()}")
        print("--------------------------------------------------------------------")

    except Exception as e:
        print(f"An error occurred during Foundation Model Client initialization: {e}")
        logger.error("Foundation Model Client initialization failed.", exc_info=True)
else:
    print("Error: CONFIG dictionary not found. Cannot initialize Foundation Model client.")
    logger.error("CONFIG not found. Skipping Cell 18 execution.")

ERROR:MIZ-OKI.FoundationModels:Foundation Model Client initialization failed.
Traceback (most recent call last):
  File "<ipython-input-46-2b51057af6a1>", line 738, in <cell line: 736>
    foundation_model_client = FoundationModelClient(CONFIG)
  File "<ipython-input-46-2b51057af6a1>", line 80, in __init__
    self.api_keys = self._load_api_keys()
  File "<ipython-input-46-2b51057af6a1>", line 157, in _load_api_keys
    self.logger.debug("OpenAI API key not found.")
AttributeError: 'FoundationModelClient' object has no attribute 'logger'


An error occurred during Foundation Model Client initialization: 'FoundationModelClient' object has no attribute 'logger'
