In [None]:
# DO NOT DELETE THIS CELL

# ***IMPORTANT*** This notebook is a work in progress (WIP). It currently calls the API for a single workspace only, for testing purposes.

# API Name: Workspaces - List Workspace Access Details
# Command:  GET https://api.fabric.microsoft.com/v1/admin/workspaces/{workspaceId}/users
# Doc:      https://learn.microsoft.com/en-us/rest/api/fabric/admin/workspaces/list-workspace-access-details

# Loads tables: fabric_workspaces_access_core
# Loads tables: fabric_workspaces_access_users
# Loads tables: fabric_workspaces_access_groups
# Loads tables: fabric_workspaces_access_service_principals
# Loads tables: fabric_workspaces_access_service_principal_profiles

In [1]:
# CELL 1 - Title and Introduction
# ==================================
# Microsoft Fabric Workspace Access Details to Delta Lake - PySpark Notebook
# This notebook retrieves Microsoft Fabric workspace access details and loads them into Delta Lake tables
# with optimization for analytics workloads using a multi-table approach for different principal types
# ==================================


# CELL 2 - Import Libraries
# ==================================
# Import required libraries
import requests
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit, from_json, when, isnotnull
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import logging
from typing import Dict, List, Optional
from delta.tables import DeltaTable
import random
import time
# ==================================


# CELL 3 - Configure Logging and Initialize Spark
# ==================================
# Logging setup
# Timestamped INFO-level messages make it possible to follow each step of the run
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Spark session with the Delta Lake extension and catalog configured
# getOrCreate() hands back the already-running session when one exists
spark = (
    SparkSession.builder
    .appName("FabricWorkspaceAccessDetailsToDelta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Session-level Delta write settings: optimized writes and auto compaction
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")
# ==================================


# CELL 4 - Configuration Parameters
# ==================================
# Configuration Parameters
# Central settings dictionary read by the functions defined later in this notebook
CONFIG = {
    "API_BASE_URL": "https://api.fabric.microsoft.com/v1",  # Prefix joined with the endpoint path to build the request URL
    "WORKSPACE_ACCESS_ENDPOINT": "/admin/workspaces/{workspaceId}/users",  # Path template; {workspaceId} is filled in with str.format
    "MAX_RETRIES": 5,  # Total number of request attempts per API call (the first attempt counts)
    "INITIAL_BACKOFF_SEC": 1,  # Starting backoff wait in seconds
    "MAX_BACKOFF_SEC": 60,  # Cap for the backoff value (before jitter) in seconds
    "BACKOFF_FACTOR": 2,  # Multiplier applied to the backoff after each wait
    "JITTER_FACTOR": 0.1,  # Random extra wait of up to this fraction of the current backoff
    "TIMEOUT": 30,  # Per-request timeout in seconds, passed to requests.get
    # Multi-table approach: one core table plus one detail table per principal type
    "ACCESS_CORE_TABLE_NAME": "fabric_workspaces_access_core",  # Core access table
    "ACCESS_USERS_TABLE_NAME": "fabric_workspaces_access_users",  # User details table
    "ACCESS_GROUPS_TABLE_NAME": "fabric_workspaces_access_groups",  # Group details table
    "ACCESS_SERVICE_PRINCIPALS_TABLE_NAME": "fabric_workspaces_access_service_principals",  # Service principal details table
    "ACCESS_SERVICE_PRINCIPAL_PROFILES_TABLE_NAME": "fabric_workspaces_access_service_principal_profiles",  # Service principal profile details table
    "LAKEHOUSE_PATH": "Tables",  # Default Tables folder in Fabric Lakehouse
    "DEBUG_MODE": True,  # When True, response keys and one sample access detail are written to the log
    # Workspace used when main() is called without a workspace_id - CHANGE THIS FOR TESTING
    # NOTE(review): main() rejects the placeholder "your-workspace-id-here"; this value never equals it, so that guard cannot fire - confirm intent
    "TEST_WORKSPACE_ID": "7a21dc44-c8b8-446e-9e80-59458a88ece8"  # Replace with actual workspace ID for testing
}

# IMPORTANT: Update the TEST_WORKSPACE_ID above with a real workspace ID for testing
# You can get workspace IDs from the workspaces API or from the Fabric portal
# ==================================


# CELL 5 - Authentication Function
# ==================================
def get_access_token():
    """
    Fetch the access token used to authenticate against the Fabric REST API.

    The token is requested from mssparkutils.credentials for the Fabric API
    audience. Any failure (including the notebookutils import itself) is
    logged and then re-raised unchanged.

    Returns:
        The value returned by mssparkutils.credentials.getToken

    Raises:
        Exception: Whatever the import or the token request raised
    """
    fabric_audience = "https://api.fabric.microsoft.com"

    try:
        # Imported here rather than at the top of the notebook, so an
        # environment without notebookutils fails in this call, with a log entry
        from notebookutils import mssparkutils

        return mssparkutils.credentials.getToken(fabric_audience)
    except Exception as token_error:
        logger.error(f"Failed to get access token: {token_error}")
        raise
# ==================================


# CELL 6 - API Call Function
# ==================================
def call_fabric_api(endpoint: str, access_token: str, params: Optional[Dict] = None) -> Dict:
    """
    Make a GET request to the Microsoft Fabric REST API with retry and backoff.

    Behaviour:
    - 429 responses are retried after the server's Retry-After value when it is
      a whole number of seconds, otherwise after exponential backoff with jitter.
    - Connection errors, timeouts and 5xx responses are retried with
      exponential backoff and jitter.
    - Other 4xx responses (except 408) are raised immediately, because
      repeating the same request cannot succeed.
    - Once the attempts are used up the last error is raised; the function
      never returns None.

    Args:
        endpoint: The API endpoint path (e.g., "/admin/workspaces/{workspaceId}/users")
        access_token: The Azure AD access token
        params: Optional query parameters for the API call

    Returns:
        dict: The parsed JSON response from the API

    Raises:
        requests.exceptions.RequestException: If the call fails with a
            non-retryable status or still fails after all attempts
        ValueError: If a successful response body is not valid JSON
    """
    url = f"{CONFIG['API_BASE_URL']}{endpoint}"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }

    max_retries = CONFIG['MAX_RETRIES']
    backoff_time = CONFIG['INITIAL_BACKOFF_SEC']

    for attempt in range(max_retries):
        last_attempt = attempt == max_retries - 1
        try:
            logger.info(f"Making API call to: {url} with params: {params} (Attempt {attempt + 1})")

            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=CONFIG['TIMEOUT']
            )
            logger.info(f"Response status: {response.status_code}")

            # Rate limit handling (429 Too Many Requests). On the last attempt
            # there is nothing left to wait for, so the 429 falls through to
            # raise_for_status() below and is reported as an error.
            if response.status_code == 429 and not last_attempt:
                retry_after = response.headers.get('Retry-After')

                if retry_after and retry_after.isdigit():
                    # The server told us how long to wait
                    wait_time = int(retry_after)
                else:
                    # Exponential backoff plus jitter so parallel clients do not retry in lockstep
                    jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
                    wait_time = backoff_time + jitter
                    backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])

                logger.warning(f"Rate limit exceeded (429). Waiting {wait_time:.2f} seconds before retry.")
                time.sleep(wait_time)
                continue

            if response.status_code >= 400:
                logger.error(f"API error: Status {response.status_code}, Response: {response.text}")
                logger.error(f"Request URL: {response.request.url}")
                # Never write the bearer token to the log
                safe_headers = {
                    name: ("<redacted>" if name.lower() == "authorization" else value)
                    for name, value in response.request.headers.items()
                }
                logger.error(f"Request headers: {safe_headers}")

            # Raises requests.exceptions.HTTPError for any 4xx/5xx status
            response.raise_for_status()

            try:
                response_json = response.json()
            except ValueError as e:
                # json.JSONDecodeError and the requests/simplejson decode errors
                # are all ValueError subclasses
                logger.error(f"Failed to parse response as JSON: {str(e)}")
                logger.error(f"Response content: {response.text[:1000]}")  # Log first 1000 chars of response
                raise

            if isinstance(response_json, dict) and isinstance(response_json.get("accessDetails"), list):
                logger.info(f"Response contains {len(response_json['accessDetails'])} items in 'accessDetails' array")
            return response_json

        except requests.exceptions.RequestException as e:
            error_response = getattr(e, 'response', None)
            status_code = error_response.status_code if error_response is not None else None
            logger.warning(f"API call failed (Attempt {attempt + 1}): {str(e)}")

            # 4xx other than 408/429 means the request itself is wrong
            # (bad token, missing permission, unknown workspace)
            client_error = (
                status_code is not None
                and 400 <= status_code < 500
                and status_code not in (408, 429)
            )

            if last_attempt or client_error:
                if client_error:
                    logger.error(f"Non-retryable client error {status_code} for endpoint: {endpoint}")
                else:
                    logger.error(f"All retry attempts failed for endpoint: {endpoint}")
                logger.error(f"Final error: {str(e)}")
                raise

            # Exponential backoff plus jitter before the next attempt
            jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
            wait_time = backoff_time + jitter
            backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])

            logger.info(f"Waiting {wait_time:.2f} seconds before retry.")
            time.sleep(wait_time)

    # Only reachable when MAX_RETRIES is configured as 0 or less
    raise requests.exceptions.RequestException(
        f"No request was attempted for endpoint {endpoint}: MAX_RETRIES is {max_retries}"
    )
# ==================================


# CELL 7 - Get Workspace Access Details Function
# ==================================
def get_workspace_access_details(access_token: str, workspace_id: str) -> List[Dict]:
    """
    Retrieve the access details of one workspace from the Fabric API.

    Builds the endpoint from CONFIG['WORKSPACE_ACCESS_ENDPOINT'], calls
    call_fabric_api and returns the "accessDetails" array of the response.

    Args:
        access_token: The Azure AD access token
        workspace_id: The workspace ID to get access details for

    Returns:
        list: The access detail objects from the response; an empty list when
        the "accessDetails" key is missing, null or empty

    Raises:
        requests.exceptions.RequestException: If the API call fails, or if it
            returns something other than a JSON object
    """
    logger.info(f"Retrieving access details for workspace: {workspace_id}")

    # Construct the endpoint with the workspace ID
    endpoint = CONFIG['WORKSPACE_ACCESS_ENDPOINT'].format(workspaceId=workspace_id)

    try:
        response_data = call_fabric_api(endpoint, access_token)

        # Guard against a missing or non-object payload so the failure is
        # reported as an API problem instead of an AttributeError further down
        if not isinstance(response_data, dict):
            raise requests.exceptions.RequestException(
                f"Unexpected response of type {type(response_data).__name__} for workspace {workspace_id}"
            )

        if CONFIG['DEBUG_MODE']:
            logger.info(f"Response keys: {list(response_data.keys())}")

        # "or []" also covers an explicit JSON null, keeping the return type a list
        access_details = response_data.get("accessDetails") or []

        if access_details:
            logger.info(f"Retrieved {len(access_details)} access details for workspace {workspace_id}")

            # Debug aid only: the sample includes principal names and IDs
            if CONFIG['DEBUG_MODE']:
                logger.info(f"Sample access detail: {json.dumps(access_details[0], indent=2)}")
        else:
            logger.warning(f"No access details found for workspace {workspace_id}")

        return access_details

    except requests.exceptions.RequestException as e:
        logger.error(f"API call failed for workspace {workspace_id}: {str(e)}")
        raise
# ==================================


# CELL 8 - Create Core Access DataFrame Function
# ==================================
def create_core_access_dataframe(access_details: List[Dict], workspace_id: str) -> "DataFrame":
    """
    Convert the access details data into the core access PySpark DataFrame.

    One row is produced per access detail, holding the fields that are common
    to every principal type, plus a last_updated_timestamp column.

    Args:
        access_details: List of access detail dictionaries from the API
        workspace_id: The workspace ID this data relates to

    Returns:
        DataFrame: A PySpark DataFrame with core access information
    """
    logger.info("Creating core access DataFrame")

    core_rows = []
    for access_detail in access_details or []:
        # "or {}" also covers keys that are present but null in the payload
        principal = access_detail.get("principal") or {}
        workspace_access = access_detail.get("workspaceAccessDetails") or {}

        core_rows.append((
            workspace_id,
            principal.get("id"),
            principal.get("displayName"),
            principal.get("type"),
            workspace_access.get("workspaceRole"),
            workspace_access.get("type"),
        ))

    # Schema used for the empty result
    schema = StructType([
        StructField("workspace_id", StringType(), False),
        StructField("principal_id", StringType(), False),
        StructField("principal_display_name", StringType(), True),
        StructField("principal_type", StringType(), False),
        StructField("workspace_role", StringType(), True),
        StructField("workspace_type", StringType(), True),
        StructField("last_updated_timestamp", TimestampType(), False)
    ])

    if not core_rows:
        logger.warning("No access details found. Creating empty core DataFrame.")
        return spark.createDataFrame([], schema)

    # Explicit string schema for the extracted rows. Type inference fails when
    # a column contains only None, which happens whenever the API omits an
    # optional field for every principal. Fields are nullable here so sparse
    # records still load.
    record_schema = StructType([
        StructField("workspace_id", StringType(), True),
        StructField("principal_id", StringType(), True),
        StructField("principal_display_name", StringType(), True),
        StructField("principal_type", StringType(), True),
        StructField("workspace_role", StringType(), True),
        StructField("workspace_type", StringType(), True)
    ])
    spark_df = spark.createDataFrame(core_rows, record_schema)

    # Add metadata column for tracking when this data was extracted
    enhanced_df = spark_df.withColumn("last_updated_timestamp", current_timestamp())

    # The row count is already known; no Spark job is needed just for the log line
    logger.info(f"Created core access DataFrame with {len(core_rows)} records")
    return enhanced_df
# ==================================


# CELL 9 - Create Principal-Specific DataFrames Functions
# ==================================
def create_user_details_dataframe(access_details: List[Dict]) -> "DataFrame":
    """
    Create a DataFrame with the details of principals of type "User".

    Args:
        access_details: List of access detail dictionaries from the API

    Returns:
        DataFrame: A PySpark DataFrame with principal_id, user_principal_name
        and last_updated_timestamp; empty when no user principal is present
    """
    logger.info("Creating user details DataFrame")

    user_rows = []
    for access_detail in access_details or []:
        # "or {}" also covers keys that are present but null in the payload
        principal = access_detail.get("principal") or {}

        # Only process User principals
        if principal.get("type") != "User":
            continue

        user_details = principal.get("userDetails") or {}
        user_rows.append((principal.get("id"), user_details.get("userPrincipalName")))

    # Schema used for the empty result
    schema = StructType([
        StructField("principal_id", StringType(), False),
        StructField("user_principal_name", StringType(), True),
        StructField("last_updated_timestamp", TimestampType(), False)
    ])

    if not user_rows:
        logger.info("No user principals found. Creating empty user details DataFrame.")
        return spark.createDataFrame([], schema)

    # Explicit string schema: type inference fails when a column contains only
    # None (e.g. no user in the response carries a userPrincipalName)
    record_schema = StructType([
        StructField("principal_id", StringType(), True),
        StructField("user_principal_name", StringType(), True)
    ])
    spark_df = spark.createDataFrame(user_rows, record_schema)
    enhanced_df = spark_df.withColumn("last_updated_timestamp", current_timestamp())

    # The row count is already known; no Spark job is needed just for the log line
    logger.info(f"Created user details DataFrame with {len(user_rows)} records")
    return enhanced_df


def create_group_details_dataframe(access_details: List[Dict]) -> "DataFrame":
    """
    Create a DataFrame with the details of principals of type "Group".

    Args:
        access_details: List of access detail dictionaries from the API

    Returns:
        DataFrame: A PySpark DataFrame with principal_id, group_type and
        last_updated_timestamp; empty when no group principal is present
    """
    logger.info("Creating group details DataFrame")

    group_rows = []
    for access_detail in access_details or []:
        # "or {}" also covers keys that are present but null in the payload
        principal = access_detail.get("principal") or {}

        # Only process Group principals
        if principal.get("type") != "Group":
            continue

        group_details = principal.get("groupDetails") or {}
        group_rows.append((principal.get("id"), group_details.get("groupType")))

    # Schema used for the empty result
    schema = StructType([
        StructField("principal_id", StringType(), False),
        StructField("group_type", StringType(), True),
        StructField("last_updated_timestamp", TimestampType(), False)
    ])

    if not group_rows:
        logger.info("No group principals found. Creating empty group details DataFrame.")
        return spark.createDataFrame([], schema)

    # Explicit string schema: type inference fails when a column contains only
    # None (e.g. no group in the response carries a groupType)
    record_schema = StructType([
        StructField("principal_id", StringType(), True),
        StructField("group_type", StringType(), True)
    ])
    spark_df = spark.createDataFrame(group_rows, record_schema)
    enhanced_df = spark_df.withColumn("last_updated_timestamp", current_timestamp())

    # The row count is already known; no Spark job is needed just for the log line
    logger.info(f"Created group details DataFrame with {len(group_rows)} records")
    return enhanced_df


def create_service_principal_details_dataframe(access_details: List[Dict]) -> "DataFrame":
    """
    Create a DataFrame with the details of principals of type "ServicePrincipal".

    Args:
        access_details: List of access detail dictionaries from the API

    Returns:
        DataFrame: A PySpark DataFrame with principal_id, aad_app_id and
        last_updated_timestamp; empty when no service principal is present
    """
    logger.info("Creating service principal details DataFrame")

    sp_rows = []
    for access_detail in access_details or []:
        # "or {}" also covers keys that are present but null in the payload
        principal = access_detail.get("principal") or {}

        # Only process ServicePrincipal principals
        if principal.get("type") != "ServicePrincipal":
            continue

        sp_details = principal.get("servicePrincipalDetails") or {}
        sp_rows.append((principal.get("id"), sp_details.get("aadAppId")))

    # Schema used for the empty result
    schema = StructType([
        StructField("principal_id", StringType(), False),
        StructField("aad_app_id", StringType(), True),
        StructField("last_updated_timestamp", TimestampType(), False)
    ])

    if not sp_rows:
        logger.info("No service principals found. Creating empty service principal details DataFrame.")
        return spark.createDataFrame([], schema)

    # Explicit string schema: type inference fails when a column contains only
    # None (e.g. no service principal in the response carries an aadAppId)
    record_schema = StructType([
        StructField("principal_id", StringType(), True),
        StructField("aad_app_id", StringType(), True)
    ])
    spark_df = spark.createDataFrame(sp_rows, record_schema)
    enhanced_df = spark_df.withColumn("last_updated_timestamp", current_timestamp())

    # The row count is already known; no Spark job is needed just for the log line
    logger.info(f"Created service principal details DataFrame with {len(sp_rows)} records")
    return enhanced_df


def create_service_principal_profile_details_dataframe(access_details: List[Dict]) -> "DataFrame":
    """
    Create a DataFrame with the details of principals of type "ServicePrincipalProfile".

    Args:
        access_details: List of access detail dictionaries from the API

    Returns:
        DataFrame: A PySpark DataFrame with principal_id, the parent
        principal's id, display name and type, and last_updated_timestamp;
        empty when no service principal profile is present
    """
    logger.info("Creating service principal profile details DataFrame")

    spp_rows = []
    for access_detail in access_details or []:
        # "or {}" also covers keys that are present but null in the payload
        principal = access_detail.get("principal") or {}

        # Only process ServicePrincipalProfile principals
        if principal.get("type") != "ServicePrincipalProfile":
            continue

        spp_details = principal.get("servicePrincipalProfileDetails") or {}
        parent_principal = spp_details.get("parentPrincipal") or {}

        spp_rows.append((
            principal.get("id"),
            parent_principal.get("id"),
            parent_principal.get("displayName"),
            parent_principal.get("type"),
        ))

    # Schema used for the empty result
    schema = StructType([
        StructField("principal_id", StringType(), False),
        StructField("parent_principal_id", StringType(), True),
        StructField("parent_principal_display_name", StringType(), True),
        StructField("parent_principal_type", StringType(), True),
        StructField("last_updated_timestamp", TimestampType(), False)
    ])

    if not spp_rows:
        logger.info("No service principal profiles found. Creating empty service principal profile details DataFrame.")
        return spark.createDataFrame([], schema)

    # Explicit string schema: type inference fails when a column contains only
    # None (e.g. no profile in the response carries a parentPrincipal)
    record_schema = StructType([
        StructField("principal_id", StringType(), True),
        StructField("parent_principal_id", StringType(), True),
        StructField("parent_principal_display_name", StringType(), True),
        StructField("parent_principal_type", StringType(), True)
    ])
    spark_df = spark.createDataFrame(spp_rows, record_schema)
    enhanced_df = spark_df.withColumn("last_updated_timestamp", current_timestamp())

    # The row count is already known; no Spark job is needed just for the log line
    logger.info(f"Created service principal profile details DataFrame with {len(spp_rows)} records")
    return enhanced_df
# ==================================


# CELL 10 - Delta Lake Operations Functions
# ==================================
def ensure_delta_table_exists(table_name: str, df_schema):
    """
    Ensure the Delta table exists, creating it empty if necessary.

    Existence is checked through the catalog instead of treating any error as
    "table missing", and the table is created with save mode "ignore". An
    existing table is therefore never overwritten, even if the check fails for
    an unrelated reason or the table appears between the check and the write.

    Args:
        table_name: Name of the Delta table
        df_schema: Schema of the DataFrame (used for the empty table)
    """
    if spark.catalog.tableExists(table_name):
        logger.info(f"Delta table '{table_name}' already exists")
        return

    logger.info(f"Creating Delta table '{table_name}'")

    # Create an empty DataFrame with the schema
    empty_df = spark.createDataFrame([], df_schema)

    # "ignore" makes the write a no-op when the table already exists
    empty_df.write \
        .mode("ignore") \
        .saveAsTable(table_name)

    logger.info(f"Delta table '{table_name}' created successfully")


def merge_core_access_data_to_delta(source_df, table_name: str):
    """
    Upsert core access rows into the target Delta table.

    The source DataFrame is registered as the temp view "access_updates".
    When the target holds no rows the source is appended as-is; otherwise a
    MERGE keyed on workspace_id + principal_id updates matching rows and
    inserts the rest.

    NOTE(review): rows that exist only in the target are left untouched, so
    access that has been revoked stays in the table - confirm this is intended.

    Args:
        source_df: DataFrame with new data
        table_name: Name of the target Delta table
    """
    logger.info(f"Starting merge operation for {table_name}")

    # Expose the incoming rows to SQL under the name used in the MERGE below
    staging_view = "access_updates"
    source_df.createOrReplaceTempView(staging_view)

    # Shortcut for a target without rows: plain append, no MERGE needed
    target_row_count = spark.table(table_name).count()
    if target_row_count == 0:
        logger.info(f"Table {table_name} is empty. Inserting all records.")
        source_df.write.mode("append").saveAsTable(table_name)
        return

    # Upsert on the composite key workspace_id + principal_id
    merge_query = f"""
    MERGE INTO {table_name} AS target
    USING access_updates AS source
    ON target.workspace_id = source.workspace_id AND target.principal_id = source.principal_id
    WHEN MATCHED THEN
        UPDATE SET 
            target.principal_display_name = source.principal_display_name,
            target.principal_type = source.principal_type,
            target.workspace_role = source.workspace_role,
            target.workspace_type = source.workspace_type,
            target.last_updated_timestamp = source.last_updated_timestamp
    WHEN NOT MATCHED THEN
        INSERT *
    """
    spark.sql(merge_query)

    logger.info("Core access merge operation completed successfully")


def merge_principal_details_to_delta(source_df, table_name: str):
    """
    Upsert principal-specific details into the target Delta table.

    The source DataFrame is registered as a temp view whose name is derived
    from the table name. When the target holds no rows the source is appended
    as-is; otherwise a MERGE keyed on principal_id updates every non-key
    column of matching rows and inserts the rest.

    Args:
        source_df: DataFrame with new data
        table_name: Name of the target Delta table
    """
    logger.info(f"Starting merge operation for {table_name}")

    # Temp view names cannot be qualified, so dots from a schema-qualified
    # table name are replaced before the name is used
    temp_view_name = f"{table_name.replace('fabric_', '').replace('.', '_')}_updates"
    source_df.createOrReplaceTempView(temp_view_name)

    # If the table is empty, just insert all records
    if spark.table(table_name).count() == 0:
        logger.info(f"Table {table_name} is empty. Inserting all records.")
        source_df.write.mode("append").saveAsTable(table_name)
        return

    # Columns to overwrite on a match: every detail column, then the timestamp.
    # Building one list and joining it keeps the clause valid even when the
    # source has no detail columns (no dangling leading comma).
    update_columns = [
        column_name
        for column_name in source_df.columns
        if column_name not in ('principal_id', 'last_updated_timestamp')
    ]
    update_columns.append('last_updated_timestamp')
    update_set_clause = ", ".join(
        f"target.{column_name} = source.{column_name}" for column_name in update_columns
    )

    # Perform the merge operation
    merge_query = f"""
    MERGE INTO {table_name} AS target
    USING {temp_view_name} AS source
    ON target.principal_id = source.principal_id
    WHEN MATCHED THEN
        UPDATE SET {update_set_clause}
    WHEN NOT MATCHED THEN
        INSERT *
    """

    spark.sql(merge_query)
    logger.info(f"Principal details merge operation completed successfully for {table_name}")


def optimize_delta_table(table_name: str):
    """
    Refresh the statistics of a Delta table (best effort).

    Runs ANALYZE TABLE ... COMPUTE STATISTICS. A failure is logged as a
    warning and swallowed, so the calling process carries on.

    Args:
        table_name: Name of the Delta table to optimize
    """
    logger.info(f"Optimizing Delta table '{table_name}'")

    analyze_statement = f"ANALYZE TABLE {table_name} COMPUTE STATISTICS"
    try:
        spark.sql(analyze_statement)
    except Exception as analyze_error:
        # Deliberately non-fatal: statistics only help query planning
        logger.warning(f"Table optimization step encountered an issue for {table_name}: {str(analyze_error)}")
        logger.info("Continuing with process - optimization is not critical for functionality")
    else:
        logger.info(f"Table statistics updated successfully for {table_name}")
        logger.info(f"Delta table optimization completed for {table_name}")
# ==================================


# CELL 11 - Main Execution Function
# ==================================
def main(workspace_id: str = None):
    """
    Run the end-to-end load of workspace access details into Delta Lake.

    Stages, in order:
    1. Obtain an access token
    2. Fetch the access details for the workspace from the API
    3. Build the core DataFrame plus one DataFrame per principal type
    4. For every target table: ensure it exists, merge when the DataFrame
       has rows, then run the table optimization step
    5. Log row counts, sample rows and summary analytics

    Args:
        workspace_id: The workspace ID to process. When falsy, the value of
            CONFIG["TEST_WORKSPACE_ID"] is used instead.

    Returns:
        None when no access details were retrieved; otherwise a dict holding
        the DataFrames under the keys "core_access", "user_details",
        "group_details", "service_principal_details" and
        "service_principal_profile_details".

    Raises:
        ValueError: If the resolved workspace ID is still the placeholder
            "your-workspace-id-here".
        Exception: Failures outside the statistics / summary steps are logged
            and re-raised; those two steps only log a warning.
    """
    def _log_query_result(heading, query):
        # Run the query first, then announce and print its result.
        result = spark.sql(query)
        logger.info(heading)
        result.show(truncate=False)

    try:
        # Explicit argument wins; otherwise fall back to the test workspace.
        ws_id = workspace_id if workspace_id else CONFIG["TEST_WORKSPACE_ID"]
        if ws_id == "your-workspace-id-here":
            raise ValueError("Please update the TEST_WORKSPACE_ID in CONFIG or provide a workspace_id parameter")

        logger.info(f"Starting Fabric Workspace Access Details to Delta Lake process for workspace: {ws_id}")

        # Stage 1: authentication
        logger.info("Getting access token...")
        token = get_access_token()
        logger.info("Successfully obtained access token")

        # Stage 2: API retrieval
        logger.info(f"Retrieving access details for workspace {ws_id}...")
        entries = get_workspace_access_details(token, ws_id)
        if not entries:
            logger.warning("No access details found. Please check your permissions and workspace ID.")
            return None

        # Stage 3: one DataFrame for the core rows, one per principal type
        logger.info("Creating DataFrames for different principal types...")
        frames = {
            "core_access": create_core_access_dataframe(entries, ws_id),
            "user_details": create_user_details_dataframe(entries),
            "group_details": create_group_details_dataframe(entries),
            "service_principal_details": create_service_principal_details_dataframe(entries),
            "service_principal_profile_details": create_service_principal_profile_details_dataframe(entries),
        }

        logger.info("Sample of core access data:")
        frames["core_access"].show(5, truncate=False)

        # Stage 4: (target table, source DataFrame, merge routine) per table
        load_plan = [
            (CONFIG["ACCESS_CORE_TABLE_NAME"], frames["core_access"], merge_core_access_data_to_delta),
            (CONFIG["ACCESS_USERS_TABLE_NAME"], frames["user_details"], merge_principal_details_to_delta),
            (CONFIG["ACCESS_GROUPS_TABLE_NAME"], frames["group_details"], merge_principal_details_to_delta),
            (CONFIG["ACCESS_SERVICE_PRINCIPALS_TABLE_NAME"], frames["service_principal_details"], merge_principal_details_to_delta),
            (CONFIG["ACCESS_SERVICE_PRINCIPAL_PROFILES_TABLE_NAME"], frames["service_principal_profile_details"], merge_principal_details_to_delta),
        ]

        for target_table, frame, merge_into in load_plan:
            logger.info(f"Processing table: {target_table}")

            # The table is created up front even when there is nothing to merge.
            ensure_delta_table_exists(target_table, frame.schema)

            if frame.count() <= 0:
                logger.info(f"No data to load for {target_table}")
            else:
                merge_into(frame, target_table)
                logger.info(f"Successfully loaded data into {target_table}")

            optimize_delta_table(target_table)

        # Stage 5: reporting
        logger.info("Loading completed successfully!")

        for target_table, _, _ in load_plan:
            try:
                total_rows = spark.table(target_table).count()
                logger.info(f"Total rows in {target_table}: {total_rows}")
                if total_rows > 0:
                    logger.info(f"Sample data from {target_table}:")
                    spark.table(target_table).show(3, truncate=False)
            except Exception as stats_error:
                # Reporting problems must not fail a load that already succeeded.
                logger.warning(f"Could not display statistics for {target_table}: {stats_error}")

        logger.info("=== WORKSPACE ACCESS SUMMARY ===")

        try:
            core_table = CONFIG["ACCESS_CORE_TABLE_NAME"]

            _log_query_result("Core access summary:", f"""
                SELECT 
                    workspace_id,
                    COUNT(*) as total_principals,
                    COUNT(DISTINCT principal_type) as unique_principal_types,
                    COUNT(DISTINCT workspace_role) as unique_roles
                FROM {core_table}
                WHERE workspace_id = '{ws_id}'
                GROUP BY workspace_id
            """)

            _log_query_result("Principal type and role distribution:", f"""
                SELECT 
                    principal_type,
                    workspace_role,
                    COUNT(*) as count
                FROM {core_table}
                WHERE workspace_id = '{ws_id}'
                GROUP BY principal_type, workspace_role
                ORDER BY principal_type, workspace_role
            """)

        except Exception as summary_error:
            logger.warning(f"Could not generate summary analytics: {summary_error}")

        # Hand every DataFrame back to the caller for further analysis.
        return frames

    except Exception as fatal_error:
        logger.error(f"Error in main execution: {fatal_error}")
        raise
# ==================================


# CELL 12 - Execute Main Function
# ==================================
# Execute the main function
if __name__ == "__main__":
    # Default run: no explicit ID, so main() falls back to the test workspace ID from config
    dataframes = main(workspace_id=None)

    # Alternative: target a specific workspace (uncomment and replace with actual ID)
    # dataframes = main("f089354e-8366-4e18-aea3-4cb4a3a50b48")
# ==================================


# CELL 13 - Utility Functions for Analysis
# ==================================
def analyze_workspace_access(workspace_id: str):
    """
    Print an access analysis for a single workspace.

    Runs, in order: an overall summary (total principals, distinct principal
    types, admin count), a role / principal-type breakdown with percentages,
    and detail listings for users, groups and service principals. Each detail
    listing is printed only when its query returns at least one row.

    Args:
        workspace_id: The workspace ID to analyze

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    logger.info(f"Analyzing workspace access patterns for workspace: {workspace_id}")

    def _show_when_present(query, heading):
        # Detail sections stay silent when the query yields no rows.
        rows = spark.sql(query)
        if rows.count() > 0:
            print(heading)
            rows.show(truncate=False)

    try:
        print("=== WORKSPACE ACCESS ANALYSIS ===\n")
        core_table = CONFIG["ACCESS_CORE_TABLE_NAME"]

        # Section 1: headline metrics, one row per metric
        overall_summary = spark.sql(f"""
            SELECT 
                'Total Principals' as metric,
                CAST(COUNT(*) as STRING) as value
            FROM {core_table}
            WHERE workspace_id = '{workspace_id}'
            
            UNION ALL
            
            SELECT 
                'Unique Principal Types' as metric,
                CAST(COUNT(DISTINCT principal_type) as STRING) as value
            FROM {core_table}
            WHERE workspace_id = '{workspace_id}'
            
            UNION ALL
            
            SELECT 
                'Admin Users' as metric,
                CAST(COUNT(*) as STRING) as value
            FROM {core_table}
            WHERE workspace_id = '{workspace_id}' AND workspace_role = 'Admin'
        """)
        print("Overall Summary:")
        overall_summary.show(truncate=False)

        # Section 2: counts and share per (role, principal type)
        role_breakdown = spark.sql(f"""
            SELECT 
                workspace_role,
                principal_type,
                COUNT(*) as count,
                ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
            FROM {core_table}
            WHERE workspace_id = '{workspace_id}'
            GROUP BY workspace_role, principal_type
            ORDER BY workspace_role, principal_type
        """)
        print("\nRole and Principal Type Breakdown:")
        role_breakdown.show(truncate=False)

        # Section 3: users
        users_table = CONFIG["ACCESS_USERS_TABLE_NAME"]
        _show_when_present(f"""
            SELECT 
                c.principal_display_name,
                c.workspace_role,
                u.user_principal_name,
                c.last_updated_timestamp
            FROM {core_table} c
            LEFT JOIN {users_table} u 
                ON c.principal_id = u.principal_id
            WHERE c.workspace_id = '{workspace_id}' 
                AND c.principal_type = 'User'
            ORDER BY c.workspace_role, c.principal_display_name
        """, "\nUser Access Details:")

        # Section 4: groups
        groups_table = CONFIG["ACCESS_GROUPS_TABLE_NAME"]
        _show_when_present(f"""
            SELECT 
                c.principal_display_name,
                c.workspace_role,
                g.group_type,
                c.last_updated_timestamp
            FROM {core_table} c
            LEFT JOIN {groups_table} g 
                ON c.principal_id = g.principal_id
            WHERE c.workspace_id = '{workspace_id}' 
                AND c.principal_type = 'Group'
            ORDER BY c.workspace_role, c.principal_display_name
        """, "\nGroup Access Details:")

        # Section 5: service principals
        service_principals_table = CONFIG["ACCESS_SERVICE_PRINCIPALS_TABLE_NAME"]
        _show_when_present(f"""
            SELECT 
                c.principal_display_name,
                c.workspace_role,
                sp.aad_app_id,
                c.last_updated_timestamp
            FROM {core_table} c
            LEFT JOIN {service_principals_table} sp 
                ON c.principal_id = sp.principal_id
            WHERE c.workspace_id = '{workspace_id}' 
                AND c.principal_type = 'ServicePrincipal'
            ORDER BY c.workspace_role, c.principal_display_name
        """, "\nService Principal Access Details:")

    except Exception as analysis_error:
        logger.error(f"Error in workspace access analysis: {analysis_error}")
        raise


def get_workspace_security_report(workspace_id: str):
    """
    Generate a security-focused report for the workspace.

    All counts are computed by a single conditional-aggregate query, so they
    come from one scan of the core access table and one consistent snapshot
    (the percentages can never be derived from a total that was read at a
    different moment than its numerator).

    Args:
        workspace_id: The workspace ID to analyze. The value is interpolated
            into the SQL text, so pass only trusted workspace IDs.

    Returns:
        dict: Security metrics and findings with the keys "workspace_id",
        "total_principals", "admin_count", "service_principal_count",
        "group_count", "admin_percentage", "service_principal_percentage"
        and "recommendations" (a list of strings, possibly empty).

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    logger.info(f"Generating security report for workspace: {workspace_id}")

    try:
        # An aggregate without GROUP BY always yields exactly one row, even
        # when no rows match the workspace (all counts are then 0).
        counts = spark.sql(f"""
            SELECT
                COUNT(*) AS total_count,
                COUNT(CASE WHEN workspace_role = 'Admin' THEN 1 END) AS admin_count,
                COUNT(CASE WHEN principal_type IN ('ServicePrincipal', 'ServicePrincipalProfile') THEN 1 END) AS sp_count,
                COUNT(CASE WHEN principal_type = 'Group' THEN 1 END) AS group_count
            FROM {CONFIG["ACCESS_CORE_TABLE_NAME"]}
            WHERE workspace_id = '{workspace_id}'
        """).collect()[0]

        total_count = counts['total_count']
        admin_count = counts['admin_count']
        sp_count = counts['sp_count']
        group_count = counts['group_count']

        def _percentage(part):
            # Share of all principals, guarded against an empty workspace.
            return round((part / total_count * 100) if total_count > 0 else 0, 2)

        security_report = {
            "workspace_id": workspace_id,
            "total_principals": total_count,
            "admin_count": admin_count,
            "service_principal_count": sp_count,
            "group_count": group_count,
            "admin_percentage": _percentage(admin_count),
            "service_principal_percentage": _percentage(sp_count)
        }

        # Security recommendations
        recommendations = []
        if admin_count > 5:
            recommendations.append("Consider reducing the number of workspace administrators")
        if sp_count > total_count * 0.3:
            recommendations.append("High number of service principals - review for necessity")
        if group_count == 0:
            recommendations.append("Consider using groups for easier access management")

        security_report["recommendations"] = recommendations

        logger.info("Security Report Generated:")
        for key, value in security_report.items():
            logger.info(f"  {key}: {value}")

        return security_report

    except Exception as e:
        logger.error(f"Error generating security report: {str(e)}")
        raise


# Example usage function
def run_analysis_example():
    """
    Demonstrate the analysis helpers against the configured test workspace.

    Reads CONFIG["TEST_WORKSPACE_ID"]; while it still holds the placeholder
    value, only a reminder to update it is printed. Otherwise the access
    analysis is printed, followed by the security report.
    """
    example_workspace_id = CONFIG["TEST_WORKSPACE_ID"]

    # Nothing sensible can be analyzed until the placeholder is replaced.
    if example_workspace_id == "your-workspace-id-here":
        print("Please update the TEST_WORKSPACE_ID in CONFIG to run the analysis example")
        return

    print("Running workspace access analysis example...")
    analyze_workspace_access(example_workspace_id)

    separator = "=" * 50
    print(f"\n{separator}")
    print("Security Report:")
    # The report logs its own contents; the returned dict is not needed here.
    get_workspace_security_report(example_workspace_id)

# Uncomment the line below to run the analysis example
# run_analysis_example()
# ==================================


# CELL 14 - Maintenance and Best Practices
# ==================================
"""
MAINTENANCE AND BEST PRACTICES FOR WORKSPACE ACCESS DETAILS:

1. SCHEDULED UPDATES:
   - Schedule this notebook to run regularly (daily/weekly) to keep access data current
   - Use Fabric pipelines to orchestrate multiple workspace processing
   - Consider processing multiple workspaces in batches

2. MULTI-WORKSPACE PROCESSING:
   - Modify main() function to accept a list of workspace IDs
   - Implement parallel processing for large numbers of workspaces
   - Use workspace metadata from the workspaces API to get all workspace IDs

3. DELTA LAKE MAINTENANCE:
   - Run VACUUM periodically on all tables to clean old files
   - Monitor table sizes and partition strategies
   - Use time travel for auditing access changes over time

4. SECURITY AND COMPLIANCE:
   - Track admin access changes over time
   - Monitor service principal access patterns
   - Create alerts for unexpected access modifications
   - Implement data retention policies for access history

5. ANALYTICS AND REPORTING:
   - Create Power BI dashboards for access governance
   - Build reports showing access patterns across workspaces
   - Track compliance with least-privilege principles
   - Monitor external access (service principals)

6. PERFORMANCE OPTIMIZATION:
   - Consider partitioning core table by workspace_id for large tenants
   - Create indexes on frequently queried columns
   - Use Delta table clustering for better query performance

7. DATA QUALITY MONITORING:
   - Implement checks for orphaned principal details
   - Monitor for schema changes in API responses
   - Validate referential integrity between tables
   - Track API response times and error rates

8. INTEGRATION PATTERNS:
   - Combine with Azure AD data for complete identity picture
   - Link with workspace usage metrics for access analytics
   - Integrate with SIEM systems for security monitoring

Example queries for ongoing analysis (substitute the table names configured in CONFIG if they differ):

-- Find workspaces with too many admins
SELECT workspace_id, COUNT(*) as admin_count
FROM fabric_workspace_access_core
WHERE workspace_role = 'Admin'
GROUP BY workspace_id
HAVING COUNT(*) > 5;

-- Track access changes over time (requires historical data)
SELECT 
    workspace_id,
    principal_type,
    workspace_role,
    COUNT(*) as current_count,
    LAG(COUNT(*)) OVER (PARTITION BY workspace_id, principal_type, workspace_role ORDER BY DATE(last_updated_timestamp)) as previous_count
FROM fabric_workspace_access_core
GROUP BY workspace_id, principal_type, workspace_role, DATE(last_updated_timestamp);

-- Find service principals with admin access
SELECT 
    c.workspace_id,
    c.principal_display_name,
    sp.aad_app_id,
    c.last_updated_timestamp
FROM fabric_workspace_access_core c
JOIN fabric_workspace_access_service_principals sp ON c.principal_id = sp.principal_id
WHERE c.workspace_role = 'Admin';

9. ERROR HANDLING AND MONITORING:
   - Implement comprehensive logging for all operations
   - Set up alerts for API failures or unexpected data patterns
   - Create dashboards for monitoring ETL process health
   - Implement retry logic with exponential backoff for resilience

10. FUTURE EXTENSIBILITY:
    - Design for new principal types that may be added to the API
    - Plan for additional workspace access details fields
    - Consider federation with other Microsoft 365 access data
    - Prepare for potential API versioning changes
"""
# ==================================


StatementMeta(, 35c566ec-c3b1-4832-81ff-4c8b43c6ca8e, 3, Finished, Available, Finished)

2025-05-27 19:09:17,636 - INFO - Starting Fabric Workspace Access Details to Delta Lake process for workspace: 7a21dc44-c8b8-446e-9e80-59458a88ece8
2025-05-27 19:09:17,637 - INFO - Getting access token...
2025-05-27 19:09:18,513 - INFO - Successfully obtained access token
2025-05-27 19:09:18,514 - INFO - Retrieving access details for workspace 7a21dc44-c8b8-446e-9e80-59458a88ece8...
2025-05-27 19:09:18,514 - INFO - Retrieving access details for workspace: 7a21dc44-c8b8-446e-9e80-59458a88ece8
2025-05-27 19:09:18,515 - INFO - Making API call to: https://api.fabric.microsoft.com/v1/admin/workspaces/7a21dc44-c8b8-446e-9e80-59458a88ece8/users with params: None (Attempt 1)
2025-05-27 19:09:18,797 - INFO - Response status: 200
2025-05-27 19:09:18,798 - INFO - Response contains 12 items in 'accessDetails' array
2025-05-27 19:09:18,801 - INFO - Response keys: ['accessDetails']
2025-05-27 19:09:18,802 - INFO - Retrieved 12 access details for workspace 7a21dc44-c8b8-446e-9e80-59458a88ece8
2025-05

+------------------------------------+------------------------------------+---------------------------+--------------+--------------+--------------+--------------------------+
|workspace_id                        |principal_id                        |principal_display_name     |principal_type|workspace_role|workspace_type|last_updated_timestamp    |
+------------------------------------+------------------------------------+---------------------------+--------------+--------------+--------------+--------------------------+
|7a21dc44-c8b8-446e-9e80-59458a88ece8|123b890b-86fe-4bd3-91b0-54ae0e368745|Gatilao,Rey Allen T Gatilao|User          |Admin         |Workspace     |2025-05-27 19:09:20.509256|
|7a21dc44-c8b8-446e-9e80-59458a88ece8|f12096e4-4ee8-4adc-8f64-1b25cc463fc4|Brent Hand                 |User          |Admin         |Workspace     |2025-05-27 19:09:20.509256|
|7a21dc44-c8b8-446e-9e80-59458a88ece8|0b536f4a-3e41-4411-8c0a-806c1935121e|Morgan Jolley              |User          |Ad

"\nMAINTENANCE AND BEST PRACTICES FOR WORKSPACE ACCESS DETAILS:\n\n1. SCHEDULED UPDATES:\n   - Schedule this notebook to run regularly (daily/weekly) to keep access data current\n   - Use Fabric pipelines to orchestrate multiple workspace processing\n   - Consider processing multiple workspaces in batches\n\n2. MULTI-WORKSPACE PROCESSING:\n   - Modify main() function to accept a list of workspace IDs\n   - Implement parallel processing for large numbers of workspaces\n   - Use workspace metadata from the workspaces API to get all workspace IDs\n\n3. DELTA LAKE MAINTENANCE:\n   - Run VACUUM periodically on all tables to clean old files\n   - Monitor table sizes and partition strategies\n   - Use time travel for auditing access changes over time\n\n4. SECURITY AND COMPLIANCE:\n   - Track admin access changes over time\n   - Monitor service principal access patterns\n   - Create alerts for unexpected access modifications\n   - Implement data retention policies for access history\n\n5. ANA