# In [1]:
# CELL 1 - Title and Introduction
# ==================================
# Microsoft Fabric Workspace Access Details to Delta Lake - PySpark Notebook
# This notebook retrieves Microsoft Fabric workspace access details and loads them into Delta Lake tables
# with optimization for analytics workloads using a multi-table approach for different principal types
# 
# The notebook automatically retrieves workspace IDs from the lakehouse and processes each workspace
# ==================================


# CELL 2 - Import Libraries
# ==================================
# Import required libraries
import requests
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit, from_json, when, isnotnull
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import logging
from typing import Dict, List, Optional
from delta.tables import DeltaTable
import random
import time
# ==================================


# CELL 3 - Configure Logging and Initialize Spark
# ==================================
# Logging setup: INFO and above, each record prefixed with a timestamp and level.
# The module-level `logger` is what every function in this notebook writes to.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Spark session: request the Delta Lake SQL extension and Delta catalog, then
# reuse the running session or create one (the original notes that Fabric
# notebooks already ship a Delta-enabled session).
spark = (
    SparkSession.builder
    .appName("FabricWorkspaceAccessDetailsToDelta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Session-level Delta write tuning: optimized writes and automatic compaction.
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")
# ==================================


# CELL 4 - Configuration Parameters
# ==================================
# Configuration Parameters
# These are the settings we'll use throughout the notebook
CONFIG = {
    # --- Fabric REST API ---
    "API_BASE_URL": "https://api.fabric.microsoft.com/v1",
    # Endpoint template for workspace access details; {workspaceId} is filled in per workspace
    "WORKSPACE_ACCESS_ENDPOINT": "/admin/workspaces/{workspaceId}/users",

    # --- Retry / backoff behaviour of the API client ---
    # Number of attempts per API call (raised to cope with rate limits)
    "MAX_RETRIES": 5,
    # First backoff delay, in seconds
    "INITIAL_BACKOFF_SEC": 1,
    # Upper bound for the backoff delay, in seconds
    "MAX_BACKOFF_SEC": 60,
    # Multiplier applied to the backoff after each failed attempt
    "BACKOFF_FACTOR": 2,
    # Random jitter added to each delay, as a fraction of the current backoff
    "JITTER_FACTOR": 0.1,
    # Per-request timeout, in seconds
    "TIMEOUT": 30,

    # --- Target Delta tables (one core table plus one per principal type) ---
    "ACCESS_CORE_TABLE_NAME": "fabric_workspaces_access_core",
    "ACCESS_USERS_TABLE_NAME": "fabric_workspaces_access_users",
    "ACCESS_GROUPS_TABLE_NAME": "fabric_workspaces_access_groups",
    "ACCESS_SERVICE_PRINCIPALS_TABLE_NAME": "fabric_workspaces_access_service_principals",
    "ACCESS_SERVICE_PRINCIPAL_PROFILES_TABLE_NAME": "fabric_workspaces_access_service_principal_profiles",
    # Default Tables folder in the Fabric Lakehouse
    "LAKEHOUSE_PATH": "Tables",
    # When True, functions emit extra diagnostic log lines
    "DEBUG_MODE": True,

    # --- Source of the workspace IDs to process ---
    "WORKSPACE_SOURCE_LAKEHOUSE": "FabricAdmin_Lakehouse",
    "WORKSPACE_SOURCE_SCHEMA": "dbo",
    "WORKSPACE_SOURCE_TABLE": "fabric_workspaces",
    # Only workspaces on this capacity are processed
    "TARGET_CAPACITY_ID": "C73A5223-9EF6-4514-83CC-3E70297EE377",
}
# ==================================


# CELL 5 - Get Workspace IDs from Lakehouse
# ==================================
def get_workspace_ids_from_lakehouse() -> List[str]:
    """
    Read the IDs of the workspaces to process from the source lakehouse table.

    The table is addressed as ``<lakehouse>.<schema>.<table>`` using the
    WORKSPACE_SOURCE_* entries of CONFIG. Only rows with ``state = 'Active'``,
    ``type <> 'Personal'`` and ``capacityId`` equal to CONFIG['TARGET_CAPACITY_ID']
    are selected.

    Returns:
        list: The ``id`` value of every matching row, collected to the driver.

    Raises:
        Exception: Any failure while running the query is logged and re-raised.
    """
    logger.info("Retrieving workspace IDs from lakehouse...")

    try:
        # Three-part name: lakehouse, schema, table
        source_table = ".".join([
            CONFIG['WORKSPACE_SOURCE_LAKEHOUSE'],
            CONFIG['WORKSPACE_SOURCE_SCHEMA'],
            CONFIG['WORKSPACE_SOURCE_TABLE'],
        ])

        # Selection criteria: active, non-personal, on the target capacity
        query = f"""
        SELECT id 
        FROM {source_table}
        WHERE state = 'Active'
        AND type <> 'Personal'
        AND capacityId = '{CONFIG['TARGET_CAPACITY_ID']}'
        """

        logger.info(f"Executing query: {query}")

        # Run the query and pull the matching rows back to the driver
        matching_rows = spark.sql(query).collect()
        workspace_ids = [matching_row['id'] for matching_row in matching_rows]

        logger.info(f"Found {len(workspace_ids)} workspaces to process")

        # In debug mode, show up to three IDs so the selection can be eyeballed
        if CONFIG['DEBUG_MODE'] and workspace_ids:
            logger.info(f"Sample workspace IDs: {workspace_ids[:3]}...")

        return workspace_ids

    except Exception as exc:
        logger.error(f"Failed to retrieve workspace IDs from lakehouse: {str(exc)}")
        logger.error("Please ensure the table exists and you have appropriate permissions")
        raise
# ==================================


# CELL 6 - Authentication Function
# ==================================
def get_access_token():
    """
    Obtain the access token used to authenticate against the Fabric REST API.

    The token is requested from ``mssparkutils.credentials.getToken`` for the
    ``https://api.fabric.microsoft.com`` audience. ``notebookutils`` is imported
    inside the function, so an environment without that package fails here
    rather than at module import time.

    Returns:
        The value returned by ``mssparkutils.credentials.getToken`` (the
        access token).

    Raises:
        Exception: Any failure (including the import) is logged and re-raised.
    """
    try:
        # Provided by the Fabric notebook runtime
        from notebookutils import mssparkutils
        return mssparkutils.credentials.getToken("https://api.fabric.microsoft.com")
    except Exception as exc:
        logger.error(f"Failed to get access token: {str(exc)}")
        raise
# ==================================


# CELL 7 - API Call Function
# ==================================
def call_fabric_api(endpoint: str, access_token: str, params: Optional[Dict] = None) -> Dict:
    """
    Make a GET request to the Microsoft Fabric REST API with retry handling.

    Behaviour:
    - Sends the bearer token in the Authorization header.
    - On HTTP 429 waits for the server's ``Retry-After`` (when it is a whole
      number of seconds) or for an exponential backoff with jitter, then retries.
    - On any ``requests`` exception (connection errors, timeouts, HTTP errors
      raised by ``raise_for_status``) waits with exponential backoff and retries.
    - Gives up after CONFIG['MAX_RETRIES'] attempts and raises; it never returns
      ``None``.

    Args:
        endpoint: The API endpoint path (e.g., "/admin/workspaces/{workspaceId}/users")
        access_token: The Azure AD access token
        params: Optional query parameters for the API call

    Returns:
        dict: The parsed JSON response from the API

    Raises:
        requests.exceptions.RequestException: If the API call fails after all
            retries. When the final attempt was rate limited this is an
            ``HTTPError`` carrying the last response.
        json.JSONDecodeError: If a successful response body is not valid JSON
            and the error raised by ``response.json()`` is not a
            ``RequestException`` (that case is retried like any other).
    """
    url = f"{CONFIG['API_BASE_URL']}{endpoint}"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }

    max_retries = CONFIG['MAX_RETRIES']
    backoff_time = CONFIG['INITIAL_BACKOFF_SEC']
    last_response = None

    def next_backoff_wait() -> float:
        """Return the next wait (backoff + jitter) and grow the backoff, capped at the maximum."""
        nonlocal backoff_time
        jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
        wait = backoff_time + jitter
        backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])
        return wait

    for attempt in range(max_retries):
        last_attempt = attempt == max_retries - 1
        try:
            # Log the full URL with parameters for debugging
            logger.info(f"Making API call to: {url} with params: {params} (Attempt {attempt + 1})")

            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=CONFIG['TIMEOUT']
            )
            last_response = response

            logger.info(f"Response status: {response.status_code}")

            # Rate limit handling (429 Too Many Requests)
            if response.status_code == 429:
                if last_attempt:
                    # No attempts left: skip the pointless sleep and fail below
                    # instead of silently falling out of the loop.
                    logger.warning("Rate limit exceeded (429) on the final attempt.")
                    break

                retry_after = response.headers.get('Retry-After')
                if retry_after and retry_after.isdigit():
                    # The server told us how long to wait
                    wait_time = int(retry_after)
                else:
                    wait_time = next_backoff_wait()

                logger.warning(f"Rate limit exceeded (429). Waiting {wait_time:.2f} seconds before retry.")
                time.sleep(wait_time)
                continue

            # Log details for error responses before raising
            if response.status_code >= 400:
                logger.error(f"API error: Status {response.status_code}, Response: {response.text}")
                logger.error(f"Request URL: {response.request.url}")
                # Never write the bearer token to the logs
                safe_headers = {
                    name: ("<redacted>" if name.lower() == "authorization" else value)
                    for name, value in response.request.headers.items()
                }
                logger.error(f"Request headers: {safe_headers}")

            # Turns any remaining 4xx/5xx into an HTTPError (handled below)
            response.raise_for_status()

            try:
                response_json = response.json()
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse response as JSON: {str(e)}")
                logger.error(f"Response content: {response.text[:1000]}")  # Log first 1000 chars of response
                raise

            if "accessDetails" in response_json and isinstance(response_json["accessDetails"], list):
                logger.info(f"Response contains {len(response_json['accessDetails'])} items in 'accessDetails' array")
            return response_json

        except requests.exceptions.RequestException as e:
            # 429 responses never reach this handler: they are dealt with
            # above, before raise_for_status() is called.
            logger.warning(f"API call failed (Attempt {attempt + 1}): {str(e)}")

            if last_attempt:
                logger.error(f"All retry attempts failed for endpoint: {endpoint}")
                logger.error(f"Final error: {str(e)}")
                raise

            wait_time = next_backoff_wait()
            logger.info(f"Waiting {wait_time:.2f} seconds before retry.")
            time.sleep(wait_time)

    # Reached only when the loop ended without returning or raising: the final
    # attempt was rate limited, or MAX_RETRIES allowed no attempt at all.
    logger.error(f"All retry attempts failed for endpoint: {endpoint}")
    raise requests.exceptions.HTTPError(
        f"No successful response from {url} after {max_retries} attempt(s)",
        response=last_response,
    )
# ==================================


# CELL 8 - Get Workspace Access Details Function
# ==================================
def get_workspace_access_details(access_token: str, workspace_id: str) -> List[Dict]:
    """
    Fetch the access details of one workspace from the Fabric API.

    The workspace ID is substituted into CONFIG['WORKSPACE_ACCESS_ENDPOINT'] and
    the request is delegated to ``call_fabric_api``. The ``accessDetails`` entry
    of the response is returned.

    Args:
        access_token: The Azure AD access token
        workspace_id: The workspace ID to get access details for

    Returns:
        list: The value of the response's ``accessDetails`` key, or an empty
        list when the key is absent.

    Raises:
        requests.exceptions.RequestException: Logged and re-raised when the API
            call fails.
    """
    logger.info(f"Retrieving access details for workspace: {workspace_id}")

    # Fill the workspace ID into the endpoint template
    endpoint = CONFIG['WORKSPACE_ACCESS_ENDPOINT'].format(workspaceId=workspace_id)

    try:
        payload = call_fabric_api(endpoint, access_token)
    except requests.exceptions.RequestException as exc:
        logger.error(f"API call failed for workspace {workspace_id}: {str(exc)}")
        raise

    # Show the top-level shape of the response when debugging
    if CONFIG['DEBUG_MODE']:
        logger.info(f"Response keys: {list(payload.keys())}")

    details = payload.get("accessDetails", [])

    # Nothing to report: warn and hand back the (empty) value as-is
    if not details:
        logger.warning(f"No access details found for workspace {workspace_id}")
        return details

    logger.info(f"Retrieved {len(details)} access details for workspace {workspace_id}")

    # Dump the first entry as a sample when debugging
    if CONFIG['DEBUG_MODE']:
        logger.info(f"Sample access detail: {json.dumps(details[0], indent=2)}")

    return details
# ==================================


# CELL 9 - Create Core Access DataFrame Function
# ==================================
def create_core_access_dataframe(access_details: List[Dict], workspace_id: str) -> "DataFrame":
    """
    Convert the access details data into a core access PySpark DataFrame.

    One row is produced per access detail, holding the fields that are common
    to every principal type, plus a ``last_updated_timestamp`` column.

    The rows are handed to Spark with an explicit all-string schema instead of
    letting Spark infer types from a pandas DataFrame: inference fails when a
    column contains only ``None`` values. The data columns are nullable, which
    is what inference produced for the original implementation.

    Args:
        access_details: List of access detail dictionaries from the API
        workspace_id: The workspace ID this data relates to

    Returns:
        DataFrame: A PySpark DataFrame with columns workspace_id, principal_id,
        principal_display_name, principal_type, workspace_role, workspace_type
        and last_updated_timestamp.
    """
    logger.info("Creating core access DataFrame")

    data_columns = [
        "workspace_id",
        "principal_id",
        "principal_display_name",
        "principal_type",
        "workspace_role",
        "workspace_type",
    ]

    # One tuple per access detail, in data_columns order. `or {}` also covers
    # a key that is present but null in the API response.
    rows = []
    for access_detail in access_details:
        principal = access_detail.get("principal") or {}
        workspace_access = access_detail.get("workspaceAccessDetails") or {}
        rows.append((
            workspace_id,
            principal.get("id"),
            principal.get("displayName"),
            principal.get("type"),
            workspace_access.get("workspaceRole"),
            workspace_access.get("type"),
        ))

    if not rows:
        logger.warning("No access details found. Creating empty core DataFrame.")
        # Schema of the empty result (kept as originally declared)
        empty_schema = StructType([
            StructField("workspace_id", StringType(), False),
            StructField("principal_id", StringType(), False),
            StructField("principal_display_name", StringType(), True),
            StructField("principal_type", StringType(), False),
            StructField("workspace_role", StringType(), True),
            StructField("workspace_type", StringType(), True),
            StructField("last_updated_timestamp", TimestampType(), False)
        ])
        return spark.createDataFrame([], empty_schema)

    # Explicit schema: no type inference, so all-None columns are fine
    data_schema = StructType([StructField(name, StringType(), True) for name in data_columns])
    spark_df = spark.createDataFrame(rows, schema=data_schema)

    # Add metadata column for tracking when this data was extracted
    enhanced_df = spark_df.withColumn("last_updated_timestamp", current_timestamp())

    # len(rows) equals the DataFrame's row count and avoids running a Spark job
    logger.info(f"Created core access DataFrame with {len(rows)} records")
    return enhanced_df
# ==================================


# CELL 10 - Create Principal-Specific DataFrames Functions
# ==================================
def create_user_details_dataframe(access_details: List[Dict]) -> "DataFrame":
    """
    Create a DataFrame specifically for user principal details.

    Only access details whose principal ``type`` is ``"User"`` contribute a row.
    The rows are given to Spark with an explicit all-string, nullable schema so
    that a column holding only ``None`` values does not break type inference.

    Args:
        access_details: List of access detail dictionaries from the API

    Returns:
        DataFrame: A PySpark DataFrame with columns principal_id,
        user_principal_name and last_updated_timestamp.
    """
    logger.info("Creating user details DataFrame")

    # (principal_id, user_principal_name) per User principal; `or {}` also
    # covers keys that are present but null.
    rows = []
    for access_detail in access_details:
        principal = access_detail.get("principal") or {}
        if principal.get("type") != "User":
            continue
        user_details = principal.get("userDetails") or {}
        rows.append((principal.get("id"), user_details.get("userPrincipalName")))

    if not rows:
        logger.info("No user principals found. Creating empty user details DataFrame.")
        # Schema of the empty result (kept as originally declared)
        empty_schema = StructType([
            StructField("principal_id", StringType(), False),
            StructField("user_principal_name", StringType(), True),
            StructField("last_updated_timestamp", TimestampType(), False)
        ])
        return spark.createDataFrame([], empty_schema)

    data_schema = StructType([
        StructField("principal_id", StringType(), True),
        StructField("user_principal_name", StringType(), True),
    ])
    spark_df = spark.createDataFrame(rows, schema=data_schema)
    enhanced_df = spark_df.withColumn("last_updated_timestamp", current_timestamp())

    # len(rows) equals the DataFrame's row count and avoids running a Spark job
    logger.info(f"Created user details DataFrame with {len(rows)} records")
    return enhanced_df


def create_group_details_dataframe(access_details: List[Dict]) -> "DataFrame":
    """
    Create a DataFrame specifically for group principal details.

    Only access details whose principal ``type`` is ``"Group"`` contribute a row.
    The rows are given to Spark with an explicit all-string, nullable schema so
    that a column holding only ``None`` values does not break type inference.

    Args:
        access_details: List of access detail dictionaries from the API

    Returns:
        DataFrame: A PySpark DataFrame with columns principal_id, group_type
        and last_updated_timestamp.
    """
    logger.info("Creating group details DataFrame")

    # (principal_id, group_type) per Group principal; `or {}` also covers
    # keys that are present but null.
    rows = []
    for access_detail in access_details:
        principal = access_detail.get("principal") or {}
        if principal.get("type") != "Group":
            continue
        group_details = principal.get("groupDetails") or {}
        rows.append((principal.get("id"), group_details.get("groupType")))

    if not rows:
        logger.info("No group principals found. Creating empty group details DataFrame.")
        # Schema of the empty result (kept as originally declared)
        empty_schema = StructType([
            StructField("principal_id", StringType(), False),
            StructField("group_type", StringType(), True),
            StructField("last_updated_timestamp", TimestampType(), False)
        ])
        return spark.createDataFrame([], empty_schema)

    data_schema = StructType([
        StructField("principal_id", StringType(), True),
        StructField("group_type", StringType(), True),
    ])
    spark_df = spark.createDataFrame(rows, schema=data_schema)
    enhanced_df = spark_df.withColumn("last_updated_timestamp", current_timestamp())

    # len(rows) equals the DataFrame's row count and avoids running a Spark job
    logger.info(f"Created group details DataFrame with {len(rows)} records")
    return enhanced_df


def create_service_principal_details_dataframe(access_details: List[Dict]) -> "DataFrame":
    """
    Create a DataFrame specifically for service principal details.

    Only access details whose principal ``type`` is ``"ServicePrincipal"``
    contribute a row. The rows are given to Spark with an explicit all-string,
    nullable schema so that a column holding only ``None`` values does not
    break type inference.

    Args:
        access_details: List of access detail dictionaries from the API

    Returns:
        DataFrame: A PySpark DataFrame with columns principal_id, aad_app_id
        and last_updated_timestamp.
    """
    logger.info("Creating service principal details DataFrame")

    # (principal_id, aad_app_id) per ServicePrincipal; `or {}` also covers
    # keys that are present but null.
    rows = []
    for access_detail in access_details:
        principal = access_detail.get("principal") or {}
        if principal.get("type") != "ServicePrincipal":
            continue
        sp_details = principal.get("servicePrincipalDetails") or {}
        rows.append((principal.get("id"), sp_details.get("aadAppId")))

    if not rows:
        logger.info("No service principals found. Creating empty service principal details DataFrame.")
        # Schema of the empty result (kept as originally declared)
        empty_schema = StructType([
            StructField("principal_id", StringType(), False),
            StructField("aad_app_id", StringType(), True),
            StructField("last_updated_timestamp", TimestampType(), False)
        ])
        return spark.createDataFrame([], empty_schema)

    data_schema = StructType([
        StructField("principal_id", StringType(), True),
        StructField("aad_app_id", StringType(), True),
    ])
    spark_df = spark.createDataFrame(rows, schema=data_schema)
    enhanced_df = spark_df.withColumn("last_updated_timestamp", current_timestamp())

    # len(rows) equals the DataFrame's row count and avoids running a Spark job
    logger.info(f"Created service principal details DataFrame with {len(rows)} records")
    return enhanced_df


def create_service_principal_profile_details_dataframe(access_details: List[Dict]) -> "DataFrame":
    """
    Create a DataFrame specifically for service principal profile details.

    Only access details whose principal ``type`` is ``"ServicePrincipalProfile"``
    contribute a row; each row also carries the id, display name and type of the
    profile's parent principal. The rows are given to Spark with an explicit
    all-string, nullable schema so that a column holding only ``None`` values
    does not break type inference.

    Args:
        access_details: List of access detail dictionaries from the API

    Returns:
        DataFrame: A PySpark DataFrame with columns principal_id,
        parent_principal_id, parent_principal_display_name,
        parent_principal_type and last_updated_timestamp.
    """
    logger.info("Creating service principal profile details DataFrame")

    data_columns = [
        "principal_id",
        "parent_principal_id",
        "parent_principal_display_name",
        "parent_principal_type",
    ]

    # One tuple per ServicePrincipalProfile, in data_columns order; `or {}`
    # also covers keys that are present but null.
    rows = []
    for access_detail in access_details:
        principal = access_detail.get("principal") or {}
        if principal.get("type") != "ServicePrincipalProfile":
            continue
        spp_details = principal.get("servicePrincipalProfileDetails") or {}
        parent_principal = spp_details.get("parentPrincipal") or {}
        rows.append((
            principal.get("id"),
            parent_principal.get("id"),
            parent_principal.get("displayName"),
            parent_principal.get("type"),
        ))

    if not rows:
        logger.info("No service principal profiles found. Creating empty service principal profile details DataFrame.")
        # Schema of the empty result (kept as originally declared)
        empty_schema = StructType([
            StructField("principal_id", StringType(), False),
            StructField("parent_principal_id", StringType(), True),
            StructField("parent_principal_display_name", StringType(), True),
            StructField("parent_principal_type", StringType(), True),
            StructField("last_updated_timestamp", TimestampType(), False)
        ])
        return spark.createDataFrame([], empty_schema)

    data_schema = StructType([StructField(name, StringType(), True) for name in data_columns])
    spark_df = spark.createDataFrame(rows, schema=data_schema)
    enhanced_df = spark_df.withColumn("last_updated_timestamp", current_timestamp())

    # len(rows) equals the DataFrame's row count and avoids running a Spark job
    logger.info(f"Created service principal profile details DataFrame with {len(rows)} records")
    return enhanced_df
# ==================================


# CELL 11 - Delta Lake Operations Functions
# ==================================
def ensure_delta_table_exists(table_name: str, df_schema):
    """
    Ensure the Delta table exists, creating it (empty) if necessary.

    Existence is checked through the Spark catalog rather than by catching
    every exception from a ``DESCRIBE TABLE``: with the broad catch, any
    failure (permissions, transient errors) was treated as "table missing" and
    followed by an overwriting write. The creating write uses save mode
    ``ignore``, so it is a no-op if the table exists by the time it runs and
    can never replace existing data.

    Args:
        table_name: Name of the Delta table
        df_schema: Schema of the DataFrame
    """
    if spark.catalog.tableExists(table_name):
        logger.info(f"Delta table '{table_name}' already exists")
        return

    logger.info(f"Creating Delta table '{table_name}'")

    # An empty DataFrame with the target schema is enough to create the table
    empty_df = spark.createDataFrame([], df_schema)
    empty_df.write.mode("ignore").saveAsTable(table_name)

    logger.info(f"Delta table '{table_name}' created successfully")


def merge_core_access_data_to_delta(source_df, table_name: str):
    """
    Upsert core access rows into the target Delta table.

    Rows are matched on the composite key ``workspace_id`` + ``principal_id``:
    - a matching target row has its remaining columns overwritten,
    - a source row without a match is inserted.
    The statement has no delete clause, so target rows that are absent from
    the source are left untouched. When the target table is empty the rows are
    simply appended and no MERGE is run.

    Args:
        source_df: DataFrame with new data
        table_name: Name of the target Delta table
    """
    logger.info(f"Starting merge operation for {table_name}")

    # Register the incoming rows under the view name the MERGE statement reads
    staging_view = "access_updates"
    source_df.createOrReplaceTempView(staging_view)

    # Empty target: a plain append is sufficient
    target_row_count = spark.table(table_name).count()
    if target_row_count == 0:
        logger.info(f"Table {table_name} is empty. Inserting all records.")
        source_df.write.mode("append").saveAsTable(table_name)
        return

    # Upsert keyed on (workspace_id, principal_id)
    upsert_sql = f"""
    MERGE INTO {table_name} AS target
    USING {staging_view} AS source
    ON target.workspace_id = source.workspace_id AND target.principal_id = source.principal_id
    WHEN MATCHED THEN
        UPDATE SET 
            target.principal_display_name = source.principal_display_name,
            target.principal_type = source.principal_type,
            target.workspace_role = source.workspace_role,
            target.workspace_type = source.workspace_type,
            target.last_updated_timestamp = source.last_updated_timestamp
    WHEN NOT MATCHED THEN
        INSERT *
    """

    spark.sql(upsert_sql)
    logger.info("Core access merge operation completed successfully")


def merge_principal_details_to_delta(source_df, table_name: str):
    """
    Merge principal-specific details into the Delta table using MERGE operation.

    Rows are matched on ``principal_id``. A matching target row has every other
    source column (and ``last_updated_timestamp``) overwritten; a source row
    without a match is inserted. When the target table is empty the rows are
    simply appended and no MERGE is run.

    The UPDATE SET clause is built from a single list of assignments, so it is
    valid SQL even when the source has no columns besides ``principal_id`` and
    ``last_updated_timestamp`` (the previous string concatenation produced a
    leading comma in that case).

    Args:
        source_df: DataFrame with new data
        table_name: Name of the target Delta table
    """
    logger.info(f"Starting merge operation for {table_name}")

    # Create a temporary view for the merge operation
    temp_view_name = f"{table_name.replace('fabric_', '')}_updates"
    source_df.createOrReplaceTempView(temp_view_name)

    # If the table is empty, just insert all records
    if spark.table(table_name).count() == 0:
        logger.info(f"Table {table_name} is empty. Inserting all records.")
        source_df.write.mode("append").saveAsTable(table_name)
        return

    # Columns to overwrite on a match: every detail column, then the timestamp.
    # The join key is excluded; the timestamp is always assigned last.
    excluded = ('principal_id', 'last_updated_timestamp')
    detail_columns = [name for name in source_df.columns if name not in excluded]
    assignments = [
        f"target.{name} = source.{name}"
        for name in detail_columns + ['last_updated_timestamp']
    ]
    update_set_clause = ", ".join(assignments)

    # Perform the merge operation
    merge_query = f"""
    MERGE INTO {table_name} AS target
    USING {temp_view_name} AS source
    ON target.principal_id = source.principal_id
    WHEN MATCHED THEN
        UPDATE SET {update_set_clause}
    WHEN NOT MATCHED THEN
        INSERT *
    """

    spark.sql(merge_query)
    logger.info(f"Principal details merge operation completed successfully for {table_name}")


def optimize_delta_table(table_name: str):
    """
    Refresh the table statistics of a Delta table.

    Runs ``ANALYZE TABLE ... COMPUTE STATISTICS`` on the table. Any failure is
    logged as a warning and swallowed, so this step never aborts the caller.

    Args:
        table_name: Name of the Delta table to optimize
    """
    logger.info(f"Optimizing Delta table '{table_name}'")

    analyze_statement = f"ANALYZE TABLE {table_name} COMPUTE STATISTICS"
    try:
        # Statistics feed the query planner
        spark.sql(analyze_statement)
        logger.info(f"Table statistics updated successfully for {table_name}")

        logger.info(f"Delta table optimization completed for {table_name}")
    except Exception as exc:
        # Best effort only: report the problem and carry on
        logger.warning(f"Table optimization step encountered an issue for {table_name}: {str(exc)}")
        logger.info("Continuing with process - optimization is not critical for functionality")
# ==================================


# CELL 12 - Process Single Workspace Function
# ==================================
def process_single_workspace(workspace_id: str, access_token: str) -> Dict:
    """
    Process access details for a single workspace.

    Steps:
    1. Retrieve the access details from the API.
    2. Build the core DataFrame and one DataFrame per principal type.
    3. For each target table, make sure it exists and merge the DataFrame into
       it when the DataFrame has rows.

    Each DataFrame is counted exactly once and the result is reused for the
    merge decision, the log line and the returned summary; every ``count()``
    is a separate Spark job.

    Args:
        workspace_id: The workspace ID to process
        access_token: The Azure AD access token

    Returns:
        dict: Summary for the workspace. ``status`` is ``"success"`` (with
        ``records_processed`` and per-principal-type counts), ``"no_data"``
        (with ``records_processed`` 0) or ``"error"`` (with the ``error``
        message). Exceptions are caught and reported through the ``"error"``
        status rather than raised.
    """
    logger.info(f"Processing workspace: {workspace_id}")

    try:
        # Retrieve access details
        access_details = get_workspace_access_details(access_token, workspace_id)

        if not access_details:
            logger.warning(f"No access details found for workspace {workspace_id}")
            return {
                "workspace_id": workspace_id,
                "status": "no_data",
                "records_processed": 0
            }

        # Create DataFrames
        core_access_df = create_core_access_dataframe(access_details, workspace_id)
        user_details_df = create_user_details_dataframe(access_details)
        group_details_df = create_group_details_dataframe(access_details)
        sp_details_df = create_service_principal_details_dataframe(access_details)
        spp_details_df = create_service_principal_profile_details_dataframe(access_details)

        # (target table, data, merge routine) in processing order:
        # core, users, groups, service principals, service principal profiles
        table_configs = [
            (CONFIG["ACCESS_CORE_TABLE_NAME"], core_access_df, merge_core_access_data_to_delta),
            (CONFIG["ACCESS_USERS_TABLE_NAME"], user_details_df, merge_principal_details_to_delta),
            (CONFIG["ACCESS_GROUPS_TABLE_NAME"], group_details_df, merge_principal_details_to_delta),
            (CONFIG["ACCESS_SERVICE_PRINCIPALS_TABLE_NAME"], sp_details_df, merge_principal_details_to_delta),
            (CONFIG["ACCESS_SERVICE_PRINCIPAL_PROFILES_TABLE_NAME"], spp_details_df, merge_principal_details_to_delta)
        ]

        # Row count per DataFrame, in table_configs order
        row_counts = []

        for table_name, df, merge_function in table_configs:
            # Ensure table exists
            ensure_delta_table_exists(table_name, df.schema)

            # Count once; merge only when there is something to merge
            row_count = df.count()
            row_counts.append(row_count)
            if row_count > 0:
                merge_function(df, table_name)

        core_count, user_count, group_count, sp_count, spp_count = row_counts

        logger.info(f"Successfully processed workspace {workspace_id} with {core_count} access records")

        return {
            "workspace_id": workspace_id,
            "status": "success",
            "records_processed": core_count,
            "user_count": user_count,
            "group_count": group_count,
            "service_principal_count": sp_count,
            "service_principal_profile_count": spp_count
        }

    except Exception as e:
        logger.error(f"Error processing workspace {workspace_id}: {str(e)}")
        return {
            "workspace_id": workspace_id,
            "status": "error",
            "error": str(e)
        }
# ==================================


# CELL 13 - Main Execution Function
# ==================================
def main():
    """
    Run the end-to-end load of workspace access details into Delta tables.

    Steps performed:
    1. Obtain an access token via get_access_token()
    2. Read the workspace IDs via get_workspace_ids_from_lakehouse()
    3. Call process_single_workspace() for every ID, pausing 0.5 s between calls
    4. Log summary counters and the row count of each configured Delta table
    5. Call optimize_delta_table() on each configured Delta table

    Returns:
        list: One result dictionary per workspace, in processing order.
              An empty list is returned when no workspace IDs were found.

    Raises:
        Exception: Any error raised outside the per-table statistics lookup
                   is logged and then re-raised to the caller.
    """
    try:
        logger.info("Starting Fabric Workspace Access Details to Delta Lake process")

        # --- Authentication ---
        logger.info("Getting access token...")
        token = get_access_token()
        logger.info("Successfully obtained access token")

        # --- Workspace discovery ---
        logger.info("Retrieving workspace IDs from lakehouse...")
        workspace_ids = get_workspace_ids_from_lakehouse()

        # Nothing to do: warn and hand back an empty result list
        if not workspace_ids:
            logger.warning("No workspace IDs found matching the criteria. Please check your lakehouse query.")
            return []

        workspace_total = len(workspace_ids)
        logger.info(f"Found {workspace_total} workspaces to process")

        # --- Per-workspace processing ---
        logger.info("Starting to process workspaces...")
        outcomes = []
        ok_count = 0
        error_count = 0
        record_total = 0

        for position, workspace_id in enumerate(workspace_ids, 1):
            logger.info(f"Processing workspace {position} of {workspace_total}: {workspace_id}")

            outcome = process_single_workspace(workspace_id, token)
            outcomes.append(outcome)

            # Tally by status; any status other than success/error is left
            # uncounted here and ends up in the "No data" figure below
            status = outcome["status"]
            if status == "success":
                ok_count += 1
                record_total += outcome.get("records_processed", 0)
            elif status == "error":
                error_count += 1

            # Short pause between workspaces (skipped after the last one)
            # to avoid rate limiting
            if position < workspace_total:
                time.sleep(0.5)

        # --- Summary ---
        separator = "=" * 60
        uncounted = workspace_total - ok_count - error_count

        logger.info(separator)
        logger.info("PROCESSING COMPLETED")
        logger.info(separator)
        logger.info(f"Total workspaces processed: {workspace_total}")
        logger.info(f"Successful: {ok_count}")
        logger.info(f"Failed: {error_count}")
        logger.info(f"No data: {uncounted}")
        logger.info(f"Total access records processed: {record_total}")

        # --- Delta table statistics ---
        logger.info("\n=== DELTA TABLE STATISTICS ===")

        delta_tables = [
            CONFIG[config_key]
            for config_key in (
                "ACCESS_CORE_TABLE_NAME",
                "ACCESS_USERS_TABLE_NAME",
                "ACCESS_GROUPS_TABLE_NAME",
                "ACCESS_SERVICE_PRINCIPALS_TABLE_NAME",
                "ACCESS_SERVICE_PRINCIPAL_PROFILES_TABLE_NAME",
            )
        ]

        for delta_table in delta_tables:
            # A failing count for one table is only a warning; the
            # remaining tables are still reported
            try:
                rows = spark.table(delta_table).count()
                logger.info(f"{delta_table}: {rows} rows")
            except Exception as stats_error:
                logger.warning(f"Could not get statistics for {delta_table}: {str(stats_error)}")

        # --- Delta table optimization ---
        logger.info("\n=== OPTIMIZING DELTA TABLES ===")
        for delta_table in delta_tables:
            optimize_delta_table(delta_table)

        # Hand the per-workspace results back for further analysis
        return outcomes

    except Exception as run_error:
        logger.error(f"Error in main execution: {str(run_error)}")
        raise
# ==================================


# CELL 14 - Execute Main Function
# ==================================
# Execute the main function
if __name__ == "__main__":
    # Run the full pipeline over every workspace found in the lakehouse
    processing_results = main()

    # Optional per-workspace breakdown, emitted only in debug mode and only
    # when at least one result came back
    if CONFIG['DEBUG_MODE']:
        if processing_results:
            logger.info("\n=== DETAILED PROCESSING RESULTS ===")
            # Cap the listing at the first 10 results to keep the log short
            for result in processing_results[:10]:
                logger.info(f"Workspace {result['workspace_id']}: {result['status']}")
                # Count details are only logged for successful results
                if result['status'] != 'success':
                    continue
                logger.info(f"  - Records: {result['records_processed']}")
                logger.info(f"  - Users: {result.get('user_count', 0)}")
                logger.info(f"  - Groups: {result.get('group_count', 0)}")
# ==================================



StatementMeta(, 40cfb356-a779-4bc9-8371-bcd62cbbd3fc, 3, Finished, Available, Finished)

2025-05-27 19:39:16,548 - INFO - Starting Fabric Workspace Access Details to Delta Lake process
2025-05-27 19:39:16,549 - INFO - Getting access token...
2025-05-27 19:39:17,392 - INFO - Successfully obtained access token
2025-05-27 19:39:17,393 - INFO - Retrieving workspace IDs from lakehouse...
2025-05-27 19:39:17,394 - INFO - Retrieving workspace IDs from lakehouse...
2025-05-27 19:39:17,395 - INFO - Executing query: 
        SELECT id 
        FROM FabricAdmin_Lakehouse.dbo.fabric_workspaces
        WHERE state = 'Active'
        AND type <> 'Personal'
        AND capacityId = 'C73A5223-9EF6-4514-83CC-3E70297EE377'
        
2025-05-27 19:39:30,864 - INFO - Found 37 workspaces to process
2025-05-27 19:39:30,865 - INFO - Sample workspace IDs: ['263575b7-d1ee-4dcc-bd05-ca67158e1bb4', 'be770ec6-b0f5-4323-9efd-1554201ff583', '73fac372-490a-4068-a2aa-856837eb6a6b']...
2025-05-27 19:39:30,865 - INFO - Found 37 workspaces to process
2025-05-27 19:39:30,867 - INFO - Starting to process works