In [7]:
# DO NOT DELETE THIS CELL

# ***IMPORTANT***: This is a work in progress! These results are filtered. See the WORKSPACE_QUERY entry in CONFIG (CELL 4 - Configuration Parameters) for the query, and pay attention to its WHERE clause.

# API Name: Workspaces - List Workspace Access Details
# Command:  GET https://api.fabric.microsoft.com/v1/admin/workspaces/{workspaceId}/users
# Doc:      https://learn.microsoft.com/en-us/rest/api/fabric/admin/workspaces/list-workspace-access-details

# Note: this queries the [fabric_workspaces] table (joined to [fabric_capacities]) to get the list of [workspaceId] values for the API calls.

# Loads tables: fabric_workspaces_access_core
# Loads tables: fabric_workspaces_access_users
# Loads tables: fabric_workspaces_access_groups
# Loads tables: fabric_workspaces_access_service_principals
# Loads tables: fabric_workspaces_access_service_principal_profiles

StatementMeta(, 19f02840-53b3-4187-a034-d02b3e514cb5, 9, Finished, Available, Finished)

In [8]:
# CELL 1 - Title and Introduction
# ==================================
# Microsoft Fabric Workspace Access Details to Delta Lake - PySpark Notebook
# This notebook retrieves Microsoft Fabric workspace access details for multiple workspaces
# and loads them into Delta Lake tables with optimization for analytics workloads using a multi-table approach
# Process: Query lakehouse for workspace IDs → Process each workspace sequentially → Load to Delta tables
# ==================================


# CELL 2 - Import Libraries
# ==================================
# Import required libraries
import requests
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit, from_json, when, isnotnull
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import logging
from typing import Dict, List, Optional
from delta.tables import DeltaTable
import random
import time
# ==================================


# CELL 3 - Configure Logging and Initialize Spark
# ==================================
# Logging setup: INFO level, each message prefixed with timestamp and severity.
# The module-level `logger` is what the rest of the notebook writes to.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Spark session with the Delta Lake SQL extension and catalog configured.
# In Fabric notebooks, Spark is pre-configured with Delta support.
spark = (
    SparkSession.builder
    .appName("FabricWorkspaceAccessDetailsToDelta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Session-level Delta write settings: optimizeWrite and autoCompact are both switched on.
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")
# ==================================


# CELL 4 - Configuration Parameters
# ==================================
# Configuration Parameters
# Single source of settings for the notebook: API location, retry/backoff tuning,
# target Delta table names, and the SQL that selects which workspaces to process.
CONFIG = {
    "API_BASE_URL": "https://api.fabric.microsoft.com/v1",
    "WORKSPACE_ACCESS_ENDPOINT": "/admin/workspaces/{workspaceId}/users",  # Endpoint template; {workspaceId} is filled in per workspace
    "MAX_RETRIES": 5,  # Total number of attempts per API call (used as the retry loop's range)
    "INITIAL_BACKOFF_SEC": 1,  # Initial backoff time in seconds
    "MAX_BACKOFF_SEC": 60,  # Maximum backoff time in seconds
    "BACKOFF_FACTOR": 2,  # Exponential backoff multiplier
    "JITTER_FACTOR": 0.1,  # Random jitter added to each backoff, as a fraction of the current backoff
    "TIMEOUT": 30,  # API request timeout in seconds
    # Multi-table approach for different principal types
    "ACCESS_CORE_TABLE_NAME": "fabric_workspaces_access_core",  # Core access table
    "ACCESS_USERS_TABLE_NAME": "fabric_workspaces_access_users",  # User details table
    "ACCESS_GROUPS_TABLE_NAME": "fabric_workspaces_access_groups",  # Group details table
    "ACCESS_SERVICE_PRINCIPALS_TABLE_NAME": "fabric_workspaces_access_service_principals",  # Service principal details table
    "ACCESS_SERVICE_PRINCIPAL_PROFILES_TABLE_NAME": "fabric_workspaces_access_service_principal_profiles",  # Service principal profile details table
    "LAKEHOUSE_PATH": "Tables",  # Default Tables folder in Fabric Lakehouse
    "DEBUG_MODE": True,  # Set to True to enable extra debugging output
    # Workspace query: returns the ids of the workspaces to process.
    # The WHERE clause restricts the result to one capacity (matched by display name),
    # active workspaces, and non-personal workspaces - results are therefore filtered.
    "WORKSPACE_QUERY": """
    SELECT 
    fw.id
    FROM 
    FabricAdmin_Lakehouse.dbo.fabric_capacities AS fc
    JOIN 
    FabricAdmin_Lakehouse.dbo.fabric_workspaces AS fw
    ON UPPER(fc.id) = fw.capacityId  -- Join on capacityId
    WHERE 
    fc.displayName = 'MDA Institutional Capacity - PROD'  -- Filter for specific displayName
    AND fw.state = 'Active'  -- Filter for active workspaces
    AND fw.type <> 'Personal'  -- Exclude personal workspaces
ORDER BY
    fw.name
    """,
    # Rate limiting - API allows 200 requests per hour
    # NOTE(review): a 5 second spacing permits up to 720 calls/hour; staying under
    # 200/hour needs a spacing of at least 18 seconds. Confirm the intended value.
    "INTER_WORKSPACE_DELAY_SEC": 5,  # Wait 5 seconds between workspace API calls
}

logger.info("Configuration initialized successfully")
# ==================================


# CELL 5 - Authentication Function
# ==================================
def get_access_token():
    """
    Fetch the access token used to authenticate against the Fabric REST API.

    The token is requested from mssparkutils (imported from notebookutils) for the
    audience "https://api.fabric.microsoft.com".

    Returns:
        The value returned by mssparkutils.credentials.getToken (the access token).

    Raises:
        Exception: Any error raised while importing notebookutils or requesting the
            token is logged and then re-raised unchanged.
    """
    try:
        # Imported lazily so that a missing notebookutils module is logged like any other failure.
        from notebookutils import mssparkutils
        return mssparkutils.credentials.getToken("https://api.fabric.microsoft.com")
    except Exception as exc:
        logger.error(f"Failed to get access token: {str(exc)}")
        raise
# ==================================


# CELL 6 - Workspace Query Function
# ==================================
def get_workspace_ids_from_lakehouse() -> List[str]:
    """
    Run CONFIG["WORKSPACE_QUERY"] and return the workspace IDs it selects.

    The query (see CONFIG) joins fabric_capacities to fabric_workspaces and keeps
    workspaces that are:
    - on the capacity with the configured display name
    - in the 'Active' state
    - not of type 'Personal'

    Returns:
        List[str]: The `id` column of the query result, in query order. An empty
        list (after a warning is logged) when the query matches nothing.

    Raises:
        Exception: Any error raised while running or collecting the query is logged
            and re-raised.
    """
    logger.info("Querying lakehouse for workspace IDs...")

    try:
        # Collect to the driver; only the id column is selected by the query.
        result_rows = spark.sql(CONFIG["WORKSPACE_QUERY"]).collect()
        workspace_ids = [result_row['id'] for result_row in result_rows]

        logger.info(f"Found {len(workspace_ids)} workspaces to process")

        # Guard clause: nothing matched the filter.
        if not workspace_ids:
            logger.warning("No workspaces found matching the criteria. Please check the query and data.")
            return []

        if CONFIG['DEBUG_MODE']:
            logger.info(f"First few workspace IDs: {workspace_ids[:5]}")

        return workspace_ids

    except Exception as exc:
        logger.error(f"Failed to query workspace IDs from lakehouse: {str(exc)}")
        logger.error("Please ensure the FabricAdmin_Lakehouse and fabric_workspaces table exist and are accessible")
        raise
# ==================================


# CELL 7 - API Call Function
# ==================================
def call_fabric_api(endpoint: str, access_token: str, params: Optional[Dict] = None) -> Dict:
    """
    Make a GET request to the Microsoft Fabric REST API with retry and backoff.

    Behaviour:
    - Sends a bearer-token authenticated GET to CONFIG['API_BASE_URL'] + endpoint.
    - On HTTP 429, waits for the server's Retry-After value when it is a whole
      number of seconds, otherwise for an exponential backoff with jitter, then retries.
    - On any other request failure (including non-2xx statuses), retries with the
      same exponential backoff until the attempts are used up.
    - CONFIG['MAX_RETRIES'] is the total number of attempts, not the number of
      retries after the first attempt.

    Args:
        endpoint: The API endpoint path (e.g., "/admin/workspaces/{workspaceId}/users")
        access_token: The Azure AD access token
        params: Optional query parameters for the API call

    Returns:
        dict: The parsed JSON response from the API

    Raises:
        requests.exceptions.HTTPError: If every attempt was throttled (429), or if
            the final attempt returned an error status.
        requests.exceptions.RequestException: If the final attempt failed for any
            other request-level reason (timeout, connection error, ...).
        json.JSONDecodeError: If a successful response body is not valid JSON and
            the raised error is not a requests exception.
    """
    url = f"{CONFIG['API_BASE_URL']}{endpoint}"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }

    max_attempts = CONFIG['MAX_RETRIES']
    backoff_time = CONFIG['INITIAL_BACKOFF_SEC']
    throttled_response = None  # Last 429 response seen, attached to the final error

    def _next_backoff_wait() -> float:
        """Return the current backoff plus jitter and advance the backoff for the next use."""
        nonlocal backoff_time
        jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
        wait = backoff_time + jitter
        backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])
        return wait

    for attempt in range(max_attempts):
        last_attempt = attempt == max_attempts - 1
        try:
            # Log the full URL with parameters for debugging
            logger.info(f"Making API call to: {url} with params: {params} (Attempt {attempt + 1})")

            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=CONFIG['TIMEOUT']
            )

            logger.info(f"Response status: {response.status_code}")

            # Rate limit handling (429 Too Many Requests)
            if response.status_code == 429:
                throttled_response = response

                if last_attempt:
                    # No retry follows, so do not sleep; fall through to the error below.
                    break

                # Prefer the server-provided wait when it is a plain number of seconds.
                retry_after = response.headers.get('Retry-After')
                if retry_after and retry_after.isdigit():
                    wait_time = int(retry_after)
                else:
                    wait_time = _next_backoff_wait()

                logger.warning(f"Rate limit exceeded (429). Waiting {wait_time:.2f} seconds before retry.")
                time.sleep(wait_time)
                continue

            # Log details for any other error status before raising
            if response.status_code >= 400:
                logger.error(f"API error: Status {response.status_code}, Response: {response.text}")
                logger.error(f"Request URL: {response.request.url}")
                # Never write the bearer token to the logs.
                redacted_headers = {
                    name: ("***REDACTED***" if name.lower() == "authorization" else value)
                    for name, value in response.request.headers.items()
                }
                logger.error(f"Request headers: {redacted_headers}")

            # Raises HTTPError for 4xx/5xx; handled by the except clause below
            response.raise_for_status()

            try:
                response_json = response.json()
                if "accessDetails" in response_json and isinstance(response_json["accessDetails"], list):
                    logger.info(f"Response contains {len(response_json['accessDetails'])} items in 'accessDetails' array")
                return response_json
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse response as JSON: {str(e)}")
                logger.error(f"Response content: {response.text[:1000]}")  # Log first 1000 chars of response
                raise

        except requests.exceptions.RequestException as e:
            # 429 responses never reach this handler: they are dealt with above,
            # before raise_for_status() is called.
            # NOTE(review): client errors such as 401/403/404 are retried like
            # transient failures; consider failing fast on them.
            logger.warning(f"API call failed (Attempt {attempt + 1}): {str(e)}")

            if last_attempt:
                logger.error(f"All retry attempts failed for endpoint: {endpoint}")
                logger.error(f"Final error: {str(e)}")
                raise

            wait_time = _next_backoff_wait()
            logger.info(f"Waiting {wait_time:.2f} seconds before retry.")
            time.sleep(wait_time)

    # Reached only when the attempts ran out on 429 responses (or MAX_RETRIES < 1).
    # Raise instead of returning None so callers get the documented exception type.
    message = f"No successful response after {max_attempts} attempt(s) for endpoint: {endpoint}"
    if throttled_response is not None:
        message += " (rate limited, HTTP 429)"
    logger.error(message)
    raise requests.exceptions.HTTPError(message, response=throttled_response)
# ==================================


# CELL 8 - Get Workspace Access Details Function
# ==================================
def get_workspace_access_details(access_token: str, workspace_id: str) -> List[Dict]:
    """
    Retrieve the access details of one workspace from the Fabric API.

    Calls the endpoint built from CONFIG['WORKSPACE_ACCESS_ENDPOINT'] for the given
    workspace and returns the "accessDetails" array of the response.

    Args:
        access_token: The Azure AD access token
        workspace_id: The workspace ID to get access details for

    Returns:
        list: The access detail objects from the response. An empty list when the
        response has no "accessDetails" key or its value is null/empty.

    Raises:
        requests.exceptions.RequestException: If the API call fails, or if it
            returns something other than a JSON object (for example no payload
            at all), so that callers see one exception type for all API failures.
    """
    logger.info(f"Retrieving access details for workspace: {workspace_id}")

    # Construct the endpoint with the workspace ID
    endpoint = CONFIG['WORKSPACE_ACCESS_ENDPOINT'].format(workspaceId=workspace_id)

    try:
        response_data = call_fabric_api(endpoint, access_token)
    except requests.exceptions.RequestException as e:
        logger.error(f"API call failed for workspace {workspace_id}: {str(e)}")
        # Re-raise the exception so calling function can handle it
        raise

    # Guard against a missing or non-object payload; without this the attribute
    # access below would fail with an unrelated AttributeError.
    if not isinstance(response_data, dict):
        message = (
            f"API call failed for workspace {workspace_id}: "
            f"unexpected response payload of type {type(response_data).__name__}"
        )
        logger.error(message)
        raise requests.exceptions.RequestException(message)

    if CONFIG['DEBUG_MODE']:
        logger.info(f"Response keys: {list(response_data.keys())}")

    # `or []` also covers an explicit null value for the key
    access_details = response_data.get("accessDetails") or []

    if access_details:
        logger.info(f"Retrieved {len(access_details)} access details for workspace {workspace_id}")

        # Log first access detail for debugging
        if CONFIG['DEBUG_MODE']:
            logger.info(f"Sample access detail: {json.dumps(access_details[0], indent=2)}")
    else:
        logger.warning(f"No access details found for workspace {workspace_id}")

    return access_details
# ==================================


# CELL 9 - Create Core Access DataFrame Function
# ==================================
def create_core_access_dataframe(access_details: List[Dict], workspace_id: str) -> "DataFrame":
    """
    Convert access details into the core access PySpark DataFrame.

    One output row is produced per access detail, holding the fields that are
    common to all principal types plus a `last_updated_timestamp` column.

    Column types are given explicitly instead of being inferred, so a batch in
    which some column is entirely null (for example every `workspace_type`
    missing) still produces string columns rather than failing inference.

    Args:
        access_details: List of access detail dictionaries from the API
        workspace_id: The workspace ID this data relates to

    Returns:
        DataFrame: Columns workspace_id, principal_id, principal_display_name,
        principal_type, workspace_role, workspace_type, last_updated_timestamp.
        Empty (with the same columns) when `access_details` is empty.
    """
    logger.info("Creating core access DataFrame")

    # One tuple per access detail, in the column order of the schema below.
    core_rows = []
    for access_detail in access_details:
        # `or {}` also covers keys that are present with a null value
        principal = access_detail.get("principal") or {}
        workspace_access = access_detail.get("workspaceAccessDetails") or {}

        core_rows.append((
            workspace_id,
            principal.get("id"),
            principal.get("displayName"),
            principal.get("type"),
            workspace_access.get("workspaceRole"),
            workspace_access.get("type"),
        ))

    if not core_rows:
        logger.warning("No access details found. Creating empty core DataFrame.")
        empty_schema = StructType([
            StructField("workspace_id", StringType(), False),
            StructField("principal_id", StringType(), False),
            StructField("principal_display_name", StringType(), True),
            StructField("principal_type", StringType(), False),
            StructField("workspace_role", StringType(), True),
            StructField("workspace_type", StringType(), True),
            StructField("last_updated_timestamp", TimestampType(), False)
        ])
        return spark.createDataFrame([], empty_schema)

    # Data columns are declared nullable (as inference produced before) so that a
    # record with a missing field is loaded rather than rejected.
    data_schema = (
        "workspace_id string, principal_id string, principal_display_name string, "
        "principal_type string, workspace_role string, workspace_type string"
    )
    spark_df = spark.createDataFrame(core_rows, schema=data_schema)

    # Add metadata column for tracking when this data was extracted
    enhanced_df = spark_df.withColumn("last_updated_timestamp", current_timestamp())

    # Row count is known from the input; no Spark job is needed just for logging.
    logger.info(f"Created core access DataFrame with {len(core_rows)} records")
    return enhanced_df
# ==================================


# CELL 10 - Create Principal-Specific DataFrames Functions
# ==================================
def create_user_details_dataframe(access_details: List[Dict]) -> "DataFrame":
    """
    Create the DataFrame of user-specific principal details.

    Only access details whose principal type is "User" contribute a row. Column
    types are given explicitly instead of being inferred, so a batch in which
    every `user_principal_name` is null still yields a string column.

    Args:
        access_details: List of access detail dictionaries from the API

    Returns:
        DataFrame: Columns principal_id, user_principal_name,
        last_updated_timestamp. Empty (same columns) when no user principal is present.
    """
    logger.info("Creating user details DataFrame")

    user_rows = []
    for access_detail in access_details:
        # `or {}` also covers keys that are present with a null value
        principal = access_detail.get("principal") or {}

        # Only process User principals
        if principal.get("type") != "User":
            continue

        user_details = principal.get("userDetails") or {}
        user_rows.append((principal.get("id"), user_details.get("userPrincipalName")))

    if not user_rows:
        logger.info("No user principals found. Creating empty user details DataFrame.")
        empty_schema = StructType([
            StructField("principal_id", StringType(), False),
            StructField("user_principal_name", StringType(), True),
            StructField("last_updated_timestamp", TimestampType(), False)
        ])
        return spark.createDataFrame([], empty_schema)

    # Data columns are nullable so a record with a missing field is still loaded.
    spark_df = spark.createDataFrame(
        user_rows, schema="principal_id string, user_principal_name string"
    )
    enhanced_df = spark_df.withColumn("last_updated_timestamp", current_timestamp())

    # Row count is known from the input; no Spark job is needed just for logging.
    logger.info(f"Created user details DataFrame with {len(user_rows)} records")
    return enhanced_df


def create_group_details_dataframe(access_details: List[Dict]) -> "DataFrame":
    """
    Create the DataFrame of group-specific principal details.

    Only access details whose principal type is "Group" contribute a row. Column
    types are given explicitly instead of being inferred, so a batch in which
    every `group_type` is null still yields a string column.

    Args:
        access_details: List of access detail dictionaries from the API

    Returns:
        DataFrame: Columns principal_id, group_type, last_updated_timestamp.
        Empty (same columns) when no group principal is present.
    """
    logger.info("Creating group details DataFrame")

    group_rows = []
    for access_detail in access_details:
        # `or {}` also covers keys that are present with a null value
        principal = access_detail.get("principal") or {}

        # Only process Group principals
        if principal.get("type") != "Group":
            continue

        group_details = principal.get("groupDetails") or {}
        group_rows.append((principal.get("id"), group_details.get("groupType")))

    if not group_rows:
        logger.info("No group principals found. Creating empty group details DataFrame.")
        empty_schema = StructType([
            StructField("principal_id", StringType(), False),
            StructField("group_type", StringType(), True),
            StructField("last_updated_timestamp", TimestampType(), False)
        ])
        return spark.createDataFrame([], empty_schema)

    # Data columns are nullable so a record with a missing field is still loaded.
    spark_df = spark.createDataFrame(
        group_rows, schema="principal_id string, group_type string"
    )
    enhanced_df = spark_df.withColumn("last_updated_timestamp", current_timestamp())

    # Row count is known from the input; no Spark job is needed just for logging.
    logger.info(f"Created group details DataFrame with {len(group_rows)} records")
    return enhanced_df


def create_service_principal_details_dataframe(access_details: List[Dict]) -> "DataFrame":
    """
    Create the DataFrame of service-principal-specific details.

    Only access details whose principal type is "ServicePrincipal" contribute a
    row. Column types are given explicitly instead of being inferred, so a batch
    in which every `aad_app_id` is null still yields a string column.

    Args:
        access_details: List of access detail dictionaries from the API

    Returns:
        DataFrame: Columns principal_id, aad_app_id, last_updated_timestamp.
        Empty (same columns) when no service principal is present.
    """
    logger.info("Creating service principal details DataFrame")

    sp_rows = []
    for access_detail in access_details:
        # `or {}` also covers keys that are present with a null value
        principal = access_detail.get("principal") or {}

        # Only process ServicePrincipal principals
        if principal.get("type") != "ServicePrincipal":
            continue

        sp_details = principal.get("servicePrincipalDetails") or {}
        sp_rows.append((principal.get("id"), sp_details.get("aadAppId")))

    if not sp_rows:
        logger.info("No service principals found. Creating empty service principal details DataFrame.")
        empty_schema = StructType([
            StructField("principal_id", StringType(), False),
            StructField("aad_app_id", StringType(), True),
            StructField("last_updated_timestamp", TimestampType(), False)
        ])
        return spark.createDataFrame([], empty_schema)

    # Data columns are nullable so a record with a missing field is still loaded.
    spark_df = spark.createDataFrame(
        sp_rows, schema="principal_id string, aad_app_id string"
    )
    enhanced_df = spark_df.withColumn("last_updated_timestamp", current_timestamp())

    # Row count is known from the input; no Spark job is needed just for logging.
    logger.info(f"Created service principal details DataFrame with {len(sp_rows)} records")
    return enhanced_df


def create_service_principal_profile_details_dataframe(access_details: List[Dict]) -> "DataFrame":
    """
    Create the DataFrame of service-principal-profile-specific details.

    Only access details whose principal type is "ServicePrincipalProfile"
    contribute a row; each row carries the id, display name and type of the
    profile's parent principal. Column types are given explicitly instead of
    being inferred, so a batch in which a parent column is entirely null still
    yields string columns.

    Args:
        access_details: List of access detail dictionaries from the API

    Returns:
        DataFrame: Columns principal_id, parent_principal_id,
        parent_principal_display_name, parent_principal_type,
        last_updated_timestamp. Empty (same columns) when no profile is present.
    """
    logger.info("Creating service principal profile details DataFrame")

    spp_rows = []
    for access_detail in access_details:
        # `or {}` also covers keys that are present with a null value
        principal = access_detail.get("principal") or {}

        # Only process ServicePrincipalProfile principals
        if principal.get("type") != "ServicePrincipalProfile":
            continue

        spp_details = principal.get("servicePrincipalProfileDetails") or {}
        parent_principal = spp_details.get("parentPrincipal") or {}
        spp_rows.append((
            principal.get("id"),
            parent_principal.get("id"),
            parent_principal.get("displayName"),
            parent_principal.get("type"),
        ))

    if not spp_rows:
        logger.info("No service principal profiles found. Creating empty service principal profile details DataFrame.")
        empty_schema = StructType([
            StructField("principal_id", StringType(), False),
            StructField("parent_principal_id", StringType(), True),
            StructField("parent_principal_display_name", StringType(), True),
            StructField("parent_principal_type", StringType(), True),
            StructField("last_updated_timestamp", TimestampType(), False)
        ])
        return spark.createDataFrame([], empty_schema)

    # Data columns are nullable so a record with a missing field is still loaded.
    data_schema = (
        "principal_id string, parent_principal_id string, "
        "parent_principal_display_name string, parent_principal_type string"
    )
    spark_df = spark.createDataFrame(spp_rows, schema=data_schema)
    enhanced_df = spark_df.withColumn("last_updated_timestamp", current_timestamp())

    # Row count is known from the input; no Spark job is needed just for logging.
    logger.info(f"Created service principal profile details DataFrame with {len(spp_rows)} records")
    return enhanced_df
# ==================================


# CELL 11 - Delta Lake Operations Functions
# ==================================
def ensure_delta_table_exists(table_name: str, df_schema):
    """
    Ensure the table exists, creating an empty one with the given schema if not.

    Existence is checked through the Spark catalog. An existing table is never
    modified: the create step uses the "ignore" save mode, so even if the table
    appears between the check and the write, its data is left untouched.

    Args:
        table_name: Name of the Delta table
        df_schema: Schema of the DataFrame (used only when the table is created)

    Raises:
        Exception: Errors from the catalog lookup or the table creation propagate
            to the caller instead of being interpreted as "table missing".
    """
    if spark.catalog.tableExists(table_name):
        logger.info(f"Delta table '{table_name}' already exists")
        return

    logger.info(f"Creating Delta table '{table_name}'")

    # An empty DataFrame carries the schema into the new table.
    empty_df = spark.createDataFrame([], df_schema)

    # "ignore" = create if absent, do nothing if present; never overwrites data.
    empty_df.write \
        .mode("ignore") \
        .saveAsTable(table_name)

    logger.info(f"Delta table '{table_name}' created successfully")


def merge_core_access_data_to_delta(source_df, table_name: str):
    """
    Upsert core access rows into the target Delta table.

    Rows are matched on the composite key workspace_id + principal_id:
    - a matching target row has its non-key columns overwritten from the source
    - a source row without a match is inserted
    Target rows that have no counterpart in the source are left as they are.

    When the target table is empty the source is appended directly and no MERGE
    statement is issued.

    Args:
        source_df: DataFrame with new data
        table_name: Name of the target Delta table
    """
    logger.info(f"Starting merge operation for {table_name}")

    # The staging view is registered first, ahead of the empty-table shortcut.
    source_df.createOrReplaceTempView("access_updates")

    target_is_empty = spark.table(table_name).count() == 0
    if target_is_empty:
        logger.info(f"Table {table_name} is empty. Inserting all records.")
        source_df.write.mode("append").saveAsTable(table_name)
        return

    # Upsert keyed on (workspace_id, principal_id)
    spark.sql(f"""
    MERGE INTO {table_name} AS target
    USING access_updates AS source
    ON target.workspace_id = source.workspace_id AND target.principal_id = source.principal_id
    WHEN MATCHED THEN
        UPDATE SET 
            target.principal_display_name = source.principal_display_name,
            target.principal_type = source.principal_type,
            target.workspace_role = source.workspace_role,
            target.workspace_type = source.workspace_type,
            target.last_updated_timestamp = source.last_updated_timestamp
    WHEN NOT MATCHED THEN
        INSERT *
    """)
    logger.info("Core access merge operation completed successfully")


def merge_principal_details_to_delta(source_df, table_name: str):
    """
    Upsert principal-specific detail rows into the target Delta table.

    Rows are matched on principal_id. A matching target row has every other
    source column copied over it; a source row without a match is inserted.
    When the target table is empty the source is appended directly and no MERGE
    statement is issued.

    The UPDATE clause is generated from the source columns. If the source has no
    column other than principal_id, the WHEN MATCHED clause is left out entirely
    rather than producing an empty (invalid) UPDATE SET.

    Args:
        source_df: DataFrame with new data
        table_name: Name of the target Delta table
    """
    logger.info(f"Starting merge operation for {table_name}")

    # Staging view name derived from the table name. Dots are replaced so that a
    # qualified name such as "schema.table" still gives a valid view identifier.
    temp_view_name = f"{table_name.replace('fabric_', '').replace('.', '_')}_updates"
    source_df.createOrReplaceTempView(temp_view_name)

    # If the table is empty, just insert all records
    if spark.table(table_name).count() == 0:
        logger.info(f"Table {table_name} is empty. Inserting all records.")
        source_df.write.mode("append").saveAsTable(table_name)
        return

    # Every source column except the join key is updated on a match
    # (this includes last_updated_timestamp when the source carries it).
    update_columns = [name for name in source_df.columns if name != 'principal_id']
    update_set_clause = ", ".join(f"target.{name} = source.{name}" for name in update_columns)
    matched_clause = (
        f"WHEN MATCHED THEN\n        UPDATE SET {update_set_clause}" if update_columns else ""
    )

    # Perform the merge operation
    merge_query = f"""
    MERGE INTO {table_name} AS target
    USING {temp_view_name} AS source
    ON target.principal_id = source.principal_id
    {matched_clause}
    WHEN NOT MATCHED THEN
        INSERT *
    """

    spark.sql(merge_query)
    logger.info(f"Principal details merge operation completed successfully for {table_name}")


def optimize_delta_table(table_name: str):
    """
    Refresh the statistics of a Delta table.

    Runs `ANALYZE TABLE ... COMPUTE STATISTICS` on the table. This step is
    best-effort: a failure is logged as a warning and is not re-raised.

    Args:
        table_name: Name of the Delta table to optimize
    """
    logger.info(f"Optimizing Delta table '{table_name}'")

    statistics_sql = f"ANALYZE TABLE {table_name} COMPUTE STATISTICS"
    try:
        spark.sql(statistics_sql)
    except Exception as exc:
        # Deliberately swallowed: statistics only affect query planning.
        logger.warning(f"Table optimization step encountered an issue for {table_name}: {str(exc)}")
        logger.info("Continuing with process - optimization is not critical for functionality")
    else:
        logger.info(f"Table statistics updated successfully for {table_name}")
        logger.info(f"Delta table optimization completed for {table_name}")
# ==================================


# CELL 12 - Process Single Workspace Function
# ==================================
def process_single_workspace(access_token: str, workspace_id: str) -> Dict:
    """
    Fetch one workspace's access details and load them into the Delta tables.

    Workflow:
    1. Call the API to get the workspace access details
    2. Build the core access DataFrame plus one DataFrame per principal type
    3. For each target table: ensure it exists, then merge the rows (if any)
    4. Refresh table statistics, but only for tables that received rows

    Errors are caught and reported in the returned summary instead of being
    raised, so the caller can carry on with the next workspace.

    Args:
        access_token: The Azure AD access token
        workspace_id: The workspace ID to process

    Returns:
        Dict: Always contains "workspace_id", "status" and "processing_time_sec".
            - status "success": also "record_counts" (table name -> rows in this batch)
            - status "skipped": also "reason" ("no_access_details")
            - status "error":   also "error" (the exception message)
    """
    logger.info(f"=== PROCESSING WORKSPACE: {workspace_id} ===")
    processing_start_time = time.time()

    try:
        # Step 1: Retrieve workspace access details from API
        logger.info(f"Step 1: Retrieving access details for workspace {workspace_id}...")
        access_details = get_workspace_access_details(access_token, workspace_id)

        if not access_details:
            logger.warning(f"No access details found for workspace {workspace_id}. Skipping.")
            return {
                "workspace_id": workspace_id,
                "status": "skipped",
                "reason": "no_access_details",
                "processing_time_sec": time.time() - processing_start_time
            }

        # Step 2: Create DataFrames for different principal types
        logger.info(f"Step 2: Creating DataFrames for {len(access_details)} access details...")

        core_access_df = create_core_access_dataframe(access_details, workspace_id)
        user_details_df = create_user_details_dataframe(access_details)
        group_details_df = create_group_details_dataframe(access_details)
        sp_details_df = create_service_principal_details_dataframe(access_details)
        spp_details_df = create_service_principal_profile_details_dataframe(access_details)

        # Step 3: Process each table (ensure exists, merge data, optimize)
        logger.info("Step 3: Processing Delta tables...")

        # (target table, batch DataFrame, merge routine) for every table this workspace feeds
        table_configs = [
            (CONFIG["ACCESS_CORE_TABLE_NAME"], core_access_df, merge_core_access_data_to_delta),
            (CONFIG["ACCESS_USERS_TABLE_NAME"], user_details_df, merge_principal_details_to_delta),
            (CONFIG["ACCESS_GROUPS_TABLE_NAME"], group_details_df, merge_principal_details_to_delta),
            (CONFIG["ACCESS_SERVICE_PRINCIPALS_TABLE_NAME"], sp_details_df, merge_principal_details_to_delta),
            (CONFIG["ACCESS_SERVICE_PRINCIPAL_PROFILES_TABLE_NAME"], spp_details_df, merge_principal_details_to_delta)
        ]

        record_counts = {}

        for table_name, df, merge_function in table_configs:
            logger.info(f"Processing table: {table_name}")

            # The table must exist even when this batch is empty, so later
            # workspaces and the final reporting can rely on it
            ensure_delta_table_exists(table_name, df.schema)

            df_count = df.count()
            record_counts[table_name] = df_count

            if df_count == 0:
                # Nothing was written, so the table is unchanged and re-computing
                # its statistics would be a redundant scan
                logger.info(f"No data to merge for {table_name}")
                continue

            merge_function(df, table_name)
            logger.info(f"Successfully merged {df_count} records into {table_name}")

            # Refresh statistics only after the table actually changed (non-critical operation)
            optimize_delta_table(table_name)

        # Step 4: Log processing summary for this workspace
        processing_time = time.time() - processing_start_time
        logger.info(f"=== WORKSPACE {workspace_id} COMPLETED SUCCESSFULLY ===")
        logger.info(f"Processing time: {processing_time:.2f} seconds")
        logger.info(f"Record counts: {record_counts}")

        return {
            "workspace_id": workspace_id,
            "status": "success",
            "record_counts": record_counts,
            "processing_time_sec": processing_time
        }

    except Exception as e:
        processing_time = time.time() - processing_start_time
        logger.error(f"=== ERROR PROCESSING WORKSPACE {workspace_id} ===")
        logger.error(f"Error: {str(e)}")
        logger.error(f"Processing time before error: {processing_time:.2f} seconds")

        # Return error details but don't re-raise - let main function handle continuation
        return {
            "workspace_id": workspace_id,
            "status": "error",
            "error": str(e),
            "processing_time_sec": processing_time
        }
# ==================================


# CELL 13 - Main Execution Function
# ==================================
def _log_processing_report(processing_results: List[Dict], total_workspaces: int, elapsed_sec: float) -> None:
    """Log the overall totals, then the per-workspace details grouped by status."""
    by_status = {"success": [], "error": [], "skipped": []}
    for result in processing_results:
        status = result.get("status")
        if status in by_status:
            by_status[status].append(result)

    successful_results = by_status["success"]
    failed_results = by_status["error"]
    skipped_results = by_status["skipped"]

    logger.info("\n" + "="*80)
    logger.info("=== FINAL PROCESSING REPORT ===")
    logger.info("="*80)
    logger.info(f"Total workspaces processed: {total_workspaces}")
    logger.info(f"Successful: {len(successful_results)}")
    logger.info(f"Failed: {len(failed_results)}")
    logger.info(f"Skipped: {len(skipped_results)}")
    logger.info(f"Overall processing time: {elapsed_sec:.2f} seconds ({elapsed_sec/60:.1f} minutes)")

    if successful_results:
        logger.info(f"\n=== SUCCESSFUL WORKSPACES ({len(successful_results)}) ===")
        for result in successful_results:
            total_records = sum(result.get("record_counts", {}).values())
            logger.info(f"  {result['workspace_id']}: {total_records} total records, {result['processing_time_sec']:.1f}s")

    if failed_results:
        logger.info(f"\n=== FAILED WORKSPACES ({len(failed_results)}) ===")
        for result in failed_results:
            logger.info(f"  {result['workspace_id']}: {result.get('error', 'Unknown error')}")

    if skipped_results:
        logger.info(f"\n=== SKIPPED WORKSPACES ({len(skipped_results)}) ===")
        for result in skipped_results:
            logger.info(f"  {result['workspace_id']}: {result.get('reason', 'Unknown reason')}")


def _log_table_statistics(table_names: List[str]) -> None:
    """Log the total row count of each table; a table that cannot be read only produces a warning."""
    logger.info("\n=== FINAL TABLE STATISTICS ===")
    for table_name in table_names:
        try:
            row_count = spark.table(table_name).count()
            logger.info(f"  {table_name}: {row_count:,} total rows")
        except Exception as e:
            logger.warning(f"  {table_name}: Could not get count - {str(e)}")


def _log_sample_analytics(core_table_name: str) -> None:
    """Run and display two summary queries over the core access table; failures only produce a warning."""
    logger.info("\n=== SAMPLE ANALYTICS ===")
    try:
        # Overall summary across all workspaces
        overall_summary = spark.sql(f"""
            SELECT 
                COUNT(DISTINCT workspace_id) as total_workspaces_with_access,
                COUNT(*) as total_access_grants,
                COUNT(DISTINCT principal_id) as unique_principals,
                COUNT(DISTINCT principal_type) as unique_principal_types
            FROM {core_table_name}
        """)

        logger.info("Overall summary across all processed workspaces:")
        overall_summary.show(truncate=False)

        # Principal type distribution
        principal_type_summary = spark.sql(f"""
            SELECT 
                principal_type,
                workspace_role,
                COUNT(*) as count,
                COUNT(DISTINCT workspace_id) as workspaces_count
            FROM {core_table_name}
            GROUP BY principal_type, workspace_role
            ORDER BY principal_type, workspace_role
        """)

        logger.info("Principal type and role distribution:")
        principal_type_summary.show(truncate=False)

    except Exception as e:
        logger.warning(f"Could not generate final analytics: {str(e)}")


def main() -> List[Dict]:
    """
    Main execution function that orchestrates the entire multi-workspace process.

    This function:
    1. Gets the authentication token
    2. Queries the lakehouse for workspace IDs to process
    3. Processes each workspace sequentially via process_single_workspace,
       waiting CONFIG['INTER_WORKSPACE_DELAY_SEC'] seconds between workspaces
    4. Logs a report of the results grouped by status
    5. Logs the row count of every target table
    6. Displays sample analytics over the core access table

    A workspace that fails is recorded with status "error" and processing
    continues with the next one. Each workspace contributes exactly one entry
    to the returned list.

    Returns:
        List[Dict]: Summary of processing results for all workspaces
            (empty when no workspace IDs were found)

    Raises:
        Exception: Any error outside the per-workspace processing (for example
            while obtaining the token or the workspace list) is logged and re-raised.
    """
    overall_start_time = time.time()
    logger.info("=== STARTING FABRIC WORKSPACE ACCESS DETAILS TO DELTA LAKE PROCESS ===")

    try:
        # Step 1: Get authentication token
        logger.info("Step 1: Getting access token...")
        access_token = get_access_token()
        logger.info("Successfully obtained access token")

        # Step 2: Get workspace IDs from lakehouse
        logger.info("Step 2: Querying lakehouse for workspace IDs...")
        workspace_ids = get_workspace_ids_from_lakehouse()

        if not workspace_ids:
            logger.error("No workspace IDs found. Exiting.")
            return []

        total_workspaces = len(workspace_ids)
        logger.info(f"Found {total_workspaces} workspaces to process")

        # Step 3: Process each workspace sequentially
        logger.info("Step 3: Processing workspaces sequentially...")
        processing_results = []

        for i, workspace_id in enumerate(workspace_ids, 1):
            logger.info(f"\n--- Processing workspace {i} of {total_workspaces}: {workspace_id} ---")

            # Only the processing call is guarded: bookkeeping and the delay below
            # must not be able to add a second record for the same workspace
            try:
                result = process_single_workspace(access_token, workspace_id)
            except Exception as e:
                # Log the error but continue with next workspace
                logger.error(f"Unexpected error processing workspace {workspace_id}: {str(e)}")
                result = {
                    "workspace_id": workspace_id,
                    "status": "error",
                    "error": f"Unexpected error: {str(e)}",
                    "processing_time_sec": 0
                }

            processing_results.append(result)

            # Rate limiting: Wait between API calls to respect the 200 requests/hour limit.
            # Applied after failures too; skipped only after the last workspace.
            if i < total_workspaces:
                logger.info(f"Waiting {CONFIG['INTER_WORKSPACE_DELAY_SEC']} seconds before next workspace (rate limiting)...")
                time.sleep(CONFIG['INTER_WORKSPACE_DELAY_SEC'])

        # Step 4: Generate comprehensive final report
        overall_processing_time = time.time() - overall_start_time
        _log_processing_report(processing_results, total_workspaces, overall_processing_time)

        # Step 5: Display final table statistics
        _log_table_statistics([
            CONFIG["ACCESS_CORE_TABLE_NAME"],
            CONFIG["ACCESS_USERS_TABLE_NAME"],
            CONFIG["ACCESS_GROUPS_TABLE_NAME"],
            CONFIG["ACCESS_SERVICE_PRINCIPALS_TABLE_NAME"],
            CONFIG["ACCESS_SERVICE_PRINCIPAL_PROFILES_TABLE_NAME"]
        ])

        # Step 6: Sample analytics queries
        _log_sample_analytics(CONFIG["ACCESS_CORE_TABLE_NAME"])

        logger.info("="*80)
        logger.info("=== PROCESS COMPLETED ===")
        logger.info("="*80)

        return processing_results

    except Exception as e:
        overall_processing_time = time.time() - overall_start_time
        logger.error(f"Critical error in main execution: {str(e)}")
        logger.error(f"Total processing time before error: {overall_processing_time:.2f} seconds")
        raise
# ==================================


# CELL 14 - Execute Main Function
# ==================================
# Execute the main function
if __name__ == "__main__":
    # Run the full multi-workspace load. `results` keeps the value returned by
    # main() (one summary dict per workspace) so later cells can inspect it.
    results = main()
    
    # Optional: persist the per-workspace results for further analysis.
    # Uncomment the following lines to save them to a table (overwritten on each run).
    # if results:
    #     results_df = spark.createDataFrame(pd.DataFrame(results))
    #     results_df.write.mode("overwrite").saveAsTable("fabric_workspace_access_processing_results")
    #     logger.info("Processing results saved to table: fabric_workspace_access_processing_results")
# ==================================

StatementMeta(, 19f02840-53b3-4187-a034-d02b3e514cb5, 10, Finished, Available, Finished)

2025-07-16 17:46:04,668 - INFO - Configuration initialized successfully
2025-07-16 17:46:04,672 - INFO - === STARTING FABRIC WORKSPACE ACCESS DETAILS TO DELTA LAKE PROCESS ===
2025-07-16 17:46:04,673 - INFO - Step 1: Getting access token...
2025-07-16 17:46:04,678 - INFO - Successfully obtained access token
2025-07-16 17:46:04,678 - INFO - Step 2: Querying lakehouse for workspace IDs...
2025-07-16 17:46:04,679 - INFO - Querying lakehouse for workspace IDs...
2025-07-16 17:46:07,692 - INFO - Found 33 workspaces to process
2025-07-16 17:46:07,692 - INFO - First few workspace IDs: ['3dc16785-8298-4567-880a-9919f074555f', '6cf9f4f0-0f1a-4514-82d6-ac6eb63f40ba', '122d675e-1ec9-497d-9709-7084a5bcf3ae', '263575b7-d1ee-4dcc-bd05-ca67158e1bb4', 'cd9d5418-e35d-400e-938d-ea91ea486e07']
2025-07-16 17:46:07,693 - INFO - Found 33 workspaces to process
2025-07-16 17:46:07,694 - INFO - Step 3: Processing workspaces sequentially...
2025-07-16 17:46:07,694 - INFO - 
--- Processing workspace 1 of 33: 3dc

+----------------------------+-------------------+-----------------+----------------------+
|total_workspaces_with_access|total_access_grants|unique_principals|unique_principal_types|
+----------------------------+-------------------+-----------------+----------------------+
|36                          |468                |280              |3                     |
+----------------------------+-------------------+-----------------+----------------------+

+----------------+--------------+-----+----------------+
|principal_type  |workspace_role|count|workspaces_count|
+----------------+--------------+-----+----------------+
|Group           |Admin         |66   |32              |
|Group           |Contributor   |1    |1               |
|Group           |Member        |3    |3               |
|Group           |Viewer        |8    |4               |
|ServicePrincipal|Admin         |1    |1               |
|ServicePrincipal|Contributor   |3    |1               |
|User            |Admin   

In [9]:
from pyspark.sql import SparkSession

# Get the Spark session (getOrCreate returns the already-running session when there is one)
spark = SparkSession.builder.appName("Refresh SQL Endpoint Metadata").getOrCreate()

# Tables loaded by this notebook whose metadata should be refreshed
TABLES_TO_REFRESH = [
    "fabric_workspaces_access_core",
    "fabric_workspaces_access_users",
    "fabric_workspaces_access_groups",
    "fabric_workspaces_access_service_principals",
    "fabric_workspaces_access_service_principal_profiles",
]

# NOTE(review): REFRESH TABLE acts on Spark's own cached metadata for the table;
# confirm it is sufficient for the SQL endpoint this cell is named after.
for refresh_target in TABLES_TO_REFRESH:
    spark.sql(f"REFRESH TABLE {refresh_target}")
    print("Metadata refresh triggered successfully.")


StatementMeta(, 19f02840-53b3-4187-a034-d02b3e514cb5, 11, Finished, Available, Finished)

Metadata refresh triggered successfully.
Metadata refresh triggered successfully.
Metadata refresh triggered successfully.
Metadata refresh triggered successfully.
Metadata refresh triggered successfully.
