In [3]:
# DO NOT DELETE THIS CELL

# ***IMPORTANT***: Work in progress (WIP). Need to look into 403 errors; my account may need to be granted access to each connection.
# This is filtered data: only items on the P1 capacity are pulled (see the WHERE clause of WORKSPACE_ITEMS_QUERY). Not all Fabric items have connections.
# Review which item types the WHERE clause filters out. Also, a particular workspace is being targeted for testing.

# API Name: Items - List Item Connections
# Command:  GET https://api.fabric.microsoft.com/v1/workspaces/{workspaceId}/items/{itemId}/connections
# Doc:      https://learn.microsoft.com/en-us/rest/api/fabric/core/items/list-item-connections

# Loads table: fabric_items_connections

StatementMeta(, d586d30c-5929-4ada-8fa4-df0be3edb891, 5, Finished, Available, Finished)

In [4]:
# CELL 1 - Title and Introduction
# ==================================
# Microsoft Fabric Item Connections to Delta Lake - PySpark Notebook
# This notebook retrieves Microsoft Fabric item connections and loads them into a Delta Lake table
# with optimization for analytics workloads
# ==================================


# CELL 2 - Import Libraries
# ==================================
# Import required libraries
import requests
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit, from_json, md5, concat_ws
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import logging
from typing import Dict, List, Optional, Tuple
from delta.tables import DeltaTable
import random
import time
import hashlib
# ==================================


# CELL 3 - Configure Logging and Initialize Spark
# ==================================
# Logging setup: timestamped, level-tagged messages at INFO and above so each
# step of the run can be traced when debugging.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Obtain the Spark session, requesting the Delta Lake SQL extension and catalog.
# In Fabric notebooks, Spark is pre-configured with Delta support
spark = (
    SparkSession.builder
    .appName("FabricItemConnectionsToDelta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Session-level Delta write settings: optimized writes and auto-compaction.
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")
# ==================================


# CELL 4 - Configuration Parameters
# ==================================
# Notebook-wide settings, grouped by concern. Every function below reads its
# tunables from this single mapping.
CONFIG = dict(
    # --- Fabric REST API ---
    API_BASE_URL="https://api.fabric.microsoft.com/v1",
    # Path template for the "List Item Connections" endpoint
    CONNECTIONS_ENDPOINT_TEMPLATE="/workspaces/{workspace_id}/items/{item_id}/connections",
    # --- Retry / backoff ---
    MAX_RETRIES=3,            # Attempts per API call
    INITIAL_BACKOFF_SEC=1,    # First backoff wait, in seconds
    MAX_BACKOFF_SEC=60,       # Upper bound for the backoff wait, in seconds
    BACKOFF_FACTOR=2,         # Multiplier applied to the backoff after each wait
    JITTER_FACTOR=0.1,        # Random jitter added to a wait, as a fraction of the backoff
    TIMEOUT=30,               # Per-request timeout, in seconds
    # --- Target table ---
    CONNECTIONS_TABLE_NAME="fabric_items_connections",  # Target Delta table
    LAKEHOUSE_PATH="Tables",  # Default Tables folder in Fabric Lakehouse
    # --- Run behaviour ---
    DEBUG_MODE=True,          # Emit extra debugging log lines when True
    BATCH_SIZE=20,            # Items processed per batch (rate-limit management)
    BATCH_DELAY_SEC=2,        # Pause between batches, in seconds
)

# Source query that yields the (workspace, item) pairs whose connections are fetched.
# - Joins fabric_capacities -> fabric_workspaces -> fabric_items in FabricAdmin_Lakehouse.
# - Keeps only the capacity with the given displayName, workspaces that are Active and
#   not Personal, and items whose type is not Report, Dashboard or Notebook.
# - Output columns are aliased WorkspaceID and ItemID; get_workspace_items() reads them
#   by those names, so keep the aliases in sync if the query is edited.
WORKSPACE_ITEMS_QUERY = """
SELECT 
    fw.id AS WorkspaceID, 
    FI.id AS ItemID 
FROM 
    FabricAdmin_Lakehouse.dbo.fabric_capacities AS fc
JOIN 
    FabricAdmin_Lakehouse.dbo.fabric_workspaces AS fw
    ON UPPER(fc.id) = fw.capacityId  -- Join on capacityId
JOIN 
    FabricAdmin_Lakehouse.dbo.fabric_items AS FI
    ON fw.id = FI.workspace_id
WHERE 
    fc.displayName = 'MDA Institutional Capacity - PROD'  -- Filter for specific displayName
    AND fw.state = 'Active'  -- Filter for active workspaces
    AND fw.type <> 'Personal'  -- Exclude personal workspaces
    AND FI.type NOT IN ('Report','Dashboard','Notebook')
ORDER BY
    fw.name,  FI.type, FI.name
"""
# ==================================


# CELL 5 - Authentication Function
# ==================================
def get_access_token():
    """
    Fetch a bearer token for the Fabric REST API from the notebook runtime.

    The token is requested through mssparkutils.credentials.getToken for the
    "https://api.fabric.microsoft.com" audience. notebookutils is imported
    inside the function, so an import failure is logged and re-raised like any
    other error instead of failing when this cell is defined.

    Returns:
        The value returned by mssparkutils.credentials.getToken (the access token).

    Raises:
        Exception: Any error raised while importing notebookutils or requesting
            the token is logged and then re-raised unchanged.
    """
    audience = "https://api.fabric.microsoft.com"
    try:
        from notebookutils import mssparkutils
        return mssparkutils.credentials.getToken(audience)
    except Exception as exc:
        logger.error(f"Failed to get access token: {str(exc)}")
        raise
# ==================================


# CELL 6 - API Call Function
# ==================================
def _backoff_with_jitter(current_backoff: float) -> Tuple[float, float]:
    """Return (seconds to wait now, backoff to use next time) from the CONFIG backoff settings."""
    jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * current_backoff)
    next_backoff = min(current_backoff * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])
    return current_backoff + jitter, next_backoff


def call_fabric_api(endpoint: str, access_token: str, params: Optional[Dict] = None) -> Dict:
    """
    Make a GET call to the Microsoft Fabric REST API with retry and rate-limit handling.

    Behaviour:
    - 429 responses are retried, waiting for the server's numeric Retry-After value
      when present, otherwise for an exponential backoff with jitter.
    - 404 responses, and 400 responses whose JSON body has
      errorCode == "OperationNotSupportedForItem", are treated as "no connections"
      and yield {"value": []}.
    - Other 4xx responses (except 408) are raised immediately: repeating the same
      request cannot succeed, so no retry budget is spent on them.
    - Transport errors, 408 and 5xx responses are retried with backoff.
    - The function never returns None: if every attempt is rate limited an
      HTTPError is raised.

    Args:
        endpoint: The API endpoint path (e.g., "/workspaces/{id}/items/{id}/connections")
        access_token: The Azure AD access token
        params: Optional query parameters for the API call

    Returns:
        dict: The JSON response from the API

    Raises:
        requests.exceptions.RequestException: If the API call fails after all retries,
            fails with a non-retryable client error, or stays rate limited.
        ValueError: If a successful response body is not valid JSON and the installed
            requests version does not wrap that error in a RequestException.
    """
    url = f"{CONFIG['API_BASE_URL']}{endpoint}"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }

    max_retries = CONFIG['MAX_RETRIES']
    backoff_time = CONFIG['INITIAL_BACKOFF_SEC']
    last_response = None

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            if CONFIG['DEBUG_MODE']:
                logger.info(f"Making API call to: {url} with params: {params} (Attempt {attempt + 1})")

            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=CONFIG['TIMEOUT']
            )
            last_response = response

            if CONFIG['DEBUG_MODE']:
                logger.info(f"Response status: {response.status_code}")

            # Rate limit handling (429 Too Many Requests)
            if response.status_code == 429:
                if is_last_attempt:
                    # No attempts left: sleeping would be pointless. Fall out of the
                    # loop and raise below instead of silently returning None.
                    break

                retry_after = response.headers.get('Retry-After')
                if retry_after and retry_after.isdigit():
                    # Server told us how long to wait
                    wait_time = int(retry_after)
                else:
                    wait_time, backoff_time = _backoff_with_jitter(backoff_time)

                logger.warning(f"Rate limit exceeded (429). Waiting {wait_time:.2f} seconds before retry.")
                time.sleep(wait_time)
                continue

            # Handle 404 errors gracefully (item may not have connections)
            if response.status_code == 404:
                logger.info(f"No connections found for endpoint: {endpoint}")
                return {"value": []}

            # Handle 400 errors with "OperationNotSupportedForItem" gracefully
            if response.status_code == 400:
                try:
                    error_response = response.json()
                except ValueError:
                    # Body is not JSON; fall through to regular error handling.
                    # ValueError covers json/simplejson/requests JSON decode errors.
                    error_response = None
                if isinstance(error_response, dict) and \
                        error_response.get("errorCode") == "OperationNotSupportedForItem":
                    logger.info(f"Item does not support connections API (expected for some item types): {endpoint}")
                    return {"value": []}

            # Log details for any other error response
            if response.status_code >= 400:
                # Never write the bearer token to the logs.
                safe_headers = {
                    name: ("<redacted>" if name.lower() == "authorization" else value)
                    for name, value in response.request.headers.items()
                }
                logger.error(f"API error: Status {response.status_code}, Response: {response.text}")
                logger.error(f"Request URL: {response.request.url}")
                logger.error(f"Request headers: {safe_headers}")

            # Raises HTTPError for remaining 4xx/5xx responses
            response.raise_for_status()

            # Successful response: parse and return the JSON body
            try:
                response_json = response.json()
            except ValueError as e:
                logger.error(f"Failed to parse response as JSON: {str(e)}")
                logger.error(f"Response content: {response.text[:1000]}")  # Log first 1000 chars of response
                raise

            if CONFIG['DEBUG_MODE'] and isinstance(response_json, dict) \
                    and isinstance(response_json.get("value"), list):
                logger.info(f"Response contains {len(response_json['value'])} connections")
            return response_json

        except requests.exceptions.RequestException as e:
            failed_response = getattr(e, 'response', None)
            status = failed_response.status_code if failed_response is not None else None
            # 4xx (other than 408 timeout / 429 rate limit) will not change on retry.
            non_retryable = status is not None and 400 <= status < 500 and status not in (408, 429)

            logger.warning(f"API call failed (Attempt {attempt + 1}): {str(e)}")

            if non_retryable:
                logger.error(f"Non-retryable client error {status} for endpoint: {endpoint}")
                raise

            if is_last_attempt:
                logger.error(f"All retry attempts failed for endpoint: {endpoint}")
                logger.error(f"Final error: {str(e)}")
                raise

            wait_time, backoff_time = _backoff_with_jitter(backoff_time)
            logger.info(f"Waiting {wait_time:.2f} seconds before retry.")
            time.sleep(wait_time)

    # Reached only when no attempt produced a usable response (persistent 429s,
    # or MAX_RETRIES configured as 0). Raise so callers never receive None.
    logger.error(f"All retry attempts failed for endpoint: {endpoint}")
    raise requests.exceptions.HTTPError(
        f"No successful response after {max_retries} attempts (rate limited) for endpoint: {endpoint}",
        response=last_response
    )
# ==================================


# CELL 7 - Get Workspace Items Function
# ==================================
def get_workspace_items() -> List[Tuple[str, str]]:
    """
    Run WORKSPACE_ITEMS_QUERY and return the workspace/item ID pairs it yields.

    The query text and a preview of the result (DataFrame.show) are emitted for
    debugging, then the full result is collected to the driver.

    Returns:
        list: (WorkspaceID, ItemID) tuples, one per row of the query result,
        in the order the query returns them.

    Raises:
        Exception: Any failure while running or collecting the query is logged
            and re-raised unchanged.
    """
    try:
        logger.info("Executing query to get workspace and item IDs...")
        logger.info(f"Query: {WORKSPACE_ITEMS_QUERY}")

        items_df = spark.sql(WORKSPACE_ITEMS_QUERY)

        # Preview for debugging
        logger.info("Query results:")
        items_df.show(truncate=False)

        # Pull every row to the driver and keep just the two ID columns
        pairs = []
        for record in items_df.collect():
            pairs.append((record["WorkspaceID"], record["ItemID"]))

        logger.info(f"Found {len(pairs)} workspace-item pairs to process")
    except Exception as exc:
        logger.error(f"Failed to get workspace items: {str(exc)}")
        raise

    return pairs
# ==================================


# CELL 8 - Get Item Connections Function
# ==================================
def get_item_connections(workspace_id: str, item_id: str, access_token: str) -> List[Dict]:
    """
    Fetch every connection of one item, following continuation tokens page by page.

    Each connection dict returned by the API is tagged in place with
    '_workspace_id' and '_item_id' so later stages know where it came from.

    Errors are deliberately not propagated: any exception raised while paging is
    logged and an empty list is returned (connections gathered on earlier pages
    are discarded), so that the caller can carry on with the next item.

    Args:
        workspace_id: The workspace ID containing the item
        item_id: The item ID to get connections for
        access_token: The Azure AD access token

    Returns:
        list: All connection dicts for the item, or [] on any error.
    """
    endpoint = CONFIG['CONNECTIONS_ENDPOINT_TEMPLATE'].format(
        workspace_id=workspace_id,
        item_id=item_id
    )
    collected = []
    next_token = None
    page_no = 0

    try:
        while True:
            page_no += 1

            # First page carries no query parameters; later pages carry the token
            if next_token:
                query = {'continuationToken': next_token}
                logger.info(f"Page {page_no}: Making request with continuation token for item {item_id}")
            else:
                query = {}
                logger.info(f"Page {page_no}: Making initial request for item {item_id}")

            payload = call_fabric_api(endpoint, access_token, query)
            page_items = payload.get("value", [])

            if not page_items:
                logger.info(f"No connections found on page {page_no} for item {item_id}")
            else:
                # Tag each connection with its origin
                for entry in page_items:
                    entry['_workspace_id'] = workspace_id
                    entry['_item_id'] = item_id

                collected.extend(page_items)
                logger.info(f"Retrieved {len(page_items)} connections on page {page_no} for item {item_id}")

                if CONFIG['DEBUG_MODE']:
                    logger.info(f"Sample connection: {json.dumps(page_items[0], indent=2)}")

            # A missing/empty continuation token marks the final page
            next_token = payload.get("continuationToken")
            if not next_token:
                logger.info(f"No continuation token found for item {item_id} - finished")
                break
            logger.info(f"Found continuation token for item {item_id}: {next_token}")

    except Exception as exc:
        error_msg = str(exc)

        if "OperationNotSupportedForItem" in error_msg:
            # Expected for item types that have no connections API
            logger.info(f"Item {item_id} does not support connections API (expected for some item types)")
        else:
            logger.error(f"Error retrieving connections for item {item_id} in workspace {workspace_id}: {error_msg}")
        # Swallow the error so the remaining items are still processed
        return []

    logger.info(f"Finished retrieving connections for item {item_id}. Total count: {len(collected)}")
    return collected
# ==================================


# CELL 9 - Get All Item Connections Function
# ==================================
def get_all_item_connections(workspace_items: List[Tuple[str, str]], access_token: str) -> List[Dict]:
    """
    Collect the connections of every (workspace_id, item_id) pair, batch by batch.

    Pairs are processed in slices of CONFIG['BATCH_SIZE']; between slices (but not
    after the last one) the function sleeps CONFIG['BATCH_DELAY_SEC'] seconds.
    An exception raised for one item is logged and that item is skipped.

    Args:
        workspace_items: List of (workspace_id, item_id) tuples
        access_token: The Azure AD access token

    Returns:
        list: Connection dicts gathered from all items, in processing order.
    """
    gathered = []
    total_items = len(workspace_items)
    batch_size = CONFIG['BATCH_SIZE']

    logger.info(f"Starting to process {total_items} workspace-item pairs")

    # One start offset per batch; the number of offsets is the number of batches
    batch_starts = range(0, total_items, batch_size)
    total_batches = len(batch_starts)

    for batch_num, start in enumerate(batch_starts, start=1):
        stop = min(start + batch_size, total_items)
        batch = workspace_items[start:stop]

        logger.info(f"Processing batch {batch_num}/{total_batches} (items {start+1}-{stop})")

        processed_ok = 0
        found_in_batch = 0

        # position is the 1-based index of the pair across the whole run
        for position, (workspace_id, item_id) in enumerate(batch, start=start + 1):
            logger.info(f"Processing item {position}/{total_items}: workspace={workspace_id}, item={item_id}")

            try:
                item_connections = get_item_connections(workspace_id, item_id, access_token)

                if not item_connections:
                    logger.debug(f"No connections found for item {item_id}")
                else:
                    gathered.extend(item_connections)
                    found_in_batch += len(item_connections)
                    logger.info(f"Added {len(item_connections)} connections from item {item_id}")

                processed_ok += 1

            except Exception as exc:
                # Skip this item and keep going
                logger.error(f"Failed to process item {item_id}: {str(exc)}")

        logger.info(f"Batch {batch_num} summary: {processed_ok}/{len(batch)} items processed successfully, {found_in_batch} connections found")

        # Pause between batches, but not after the final one
        if stop < total_items:
            logger.info(f"Batch {batch_num} completed. Waiting {CONFIG['BATCH_DELAY_SEC']} seconds before next batch...")
            time.sleep(CONFIG['BATCH_DELAY_SEC'])

    logger.info(f"Finished processing all items. Total connections found: {len(gathered)}")
    return gathered
# ==================================


# CELL 10 - Create Enhanced DataFrame Function
# ==================================
def create_enhanced_connections_dataframe(connections_data: List[Dict]) -> "DataFrame":
    """
    Convert the connections data into an enhanced PySpark DataFrame for Delta Lake.

    This function:
    - Flattens each connection dict (including its nested "connectionDetails")
    - Computes an MD5 hash over the identifying fields for change detection
    - Builds the DataFrame with an explicit schema, so column types do not depend
      on the data (an all-null column such as gateway_id cannot be type-inferred)
    - Adds an extraction_timestamp column

    The empty and non-empty paths return the same column names and types.

    Missing or explicitly-null values are normalised the same way:
    connectivityType / connectionDetails.type fall back to "Unknown" and
    connectionDetails.path falls back to "". A null "connectionDetails" is
    treated as an empty object. Records without '_workspace_id' or '_item_id'
    are skipped with a warning because those columns are required.

    Args:
        connections_data: List of connection dictionaries from the API, each tagged
            with '_workspace_id' and '_item_id'.

    Returns:
        DataFrame: An enhanced PySpark DataFrame ready for Delta Lake
    """
    # Columns derived from the API payload (everything except the timestamp)
    payload_fields = [
        StructField("workspace_id", StringType(), False),             # Required
        StructField("item_id", StringType(), False),                  # Required
        StructField("connection_id", StringType(), True),             # Not always present
        StructField("display_name", StringType(), True),              # Not always present
        StructField("connectivity_type", StringType(), False),        # Defaulted to "Unknown"
        StructField("gateway_id", StringType(), True),                # Only for gateway connections
        StructField("connection_details_type", StringType(), False),  # Defaulted to "Unknown"
        StructField("connection_details_path", StringType(), False),  # Defaulted to ""
        StructField("connection_hash", StringType(), False),          # Calculated below
    ]
    payload_schema = StructType(payload_fields)
    full_schema = StructType(
        payload_fields + [StructField("extraction_timestamp", TimestampType(), False)]
    )

    rows = []
    skipped = 0

    for connection in connections_data or []:
        workspace_id = connection.get("_workspace_id")
        item_id = connection.get("_item_id")
        if workspace_id is None or item_id is None:
            # Cannot satisfy the non-nullable key columns; skip rather than fail the batch
            skipped += 1
            continue

        connection_id = connection.get("id")            # May be None for some connection types
        display_name = connection.get("displayName")    # May be None
        connectivity_type = connection.get("connectivityType") or "Unknown"
        gateway_id = connection.get("gatewayId")        # May be None

        # `or {}` also covers an explicit JSON null, which .get(key, {}) would pass through
        connection_details = connection.get("connectionDetails") or {}
        connection_details_type = connection_details.get("type") or "Unknown"
        connection_details_path = connection_details.get("path") or ""

        # Hash of the fields whose change indicates a meaningfully different connection
        hash_string = f"{workspace_id}|{item_id}|{connectivity_type}|{connection_details_type}|{connection_details_path}|{gateway_id or ''}"
        connection_hash = hashlib.md5(hash_string.encode()).hexdigest()

        # Tuple order must match payload_schema
        rows.append((
            workspace_id,
            item_id,
            connection_id,
            display_name,
            connectivity_type,
            gateway_id,
            connection_details_type,
            connection_details_path,
            connection_hash,
        ))

    if skipped:
        logger.warning(f"Skipped {skipped} connection records without workspace/item context")

    if not rows:
        logger.warning("No connections found. Creating empty DataFrame.")
        return spark.createDataFrame([], full_schema)

    # Explicit schema: no pandas round-trip and no type inference
    spark_df = spark.createDataFrame(rows, schema=payload_schema)

    # Add metadata column for tracking when this data was extracted
    return spark_df.withColumn("extraction_timestamp", current_timestamp())
# ==================================


# CELL 11 - Delta Lake Operations Functions
# ==================================
def ensure_delta_table_exists(table_name: str, df_schema):
    """
    Ensure the Delta table exists, creating an empty one if necessary.

    Existence is checked with spark.catalog.tableExists rather than by catching
    every exception from DESCRIBE TABLE: with a catch-all, an unrelated failure
    (permissions, transient metastore error) would be mistaken for "table
    missing" and the overwrite below could wipe an existing table. Errors from
    the existence check now propagate to the caller instead.

    Args:
        table_name: Name of the Delta table
        df_schema: Schema (StructType) to create the table with when it is missing
    """
    if spark.catalog.tableExists(table_name):
        logger.info(f"Delta table '{table_name}' already exists")
        return

    logger.info(f"Creating Delta table '{table_name}'")

    # An empty DataFrame carrying just the schema
    empty_df = spark.createDataFrame([], df_schema)

    # Reached only when the catalog reports no such table, so overwrite cannot
    # destroy registered data here.
    empty_df.write \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable(table_name)

    logger.info(f"Delta table '{table_name}' created successfully")


def merge_data_to_delta(source_df, table_name: str):
    """
    Merge new connection data into the Delta table using MERGE operation.

    This function performs an upsert operation:
    - Updates existing records if the composite key matches
    - Inserts new records if the composite key doesn't exist

    The composite key consists of: workspace_id, item_id, connection_hash

    The source is first reduced to one row per composite key. Delta MERGE fails
    when several source rows match the same target row, and connection_hash does
    not include the connection id, so two connections of one item can share a
    key. Which of the duplicate rows is kept is not specified.

    The de-duplicated source is also registered as the temporary view
    "connection_updates".

    Args:
        source_df: DataFrame with new data
        table_name: Name of the target Delta table
    """
    logger.info(f"Starting merge operation for {table_name}")

    key_columns = ["workspace_id", "item_id", "connection_hash"]

    # One source row per merge key, applied to both the first-load and merge paths
    deduped_df = source_df.dropDuplicates(key_columns)
    duplicate_rows = source_df.count() - deduped_df.count()
    if duplicate_rows:
        logger.warning(
            f"Dropped {duplicate_rows} source rows sharing a composite key "
            f"({', '.join(key_columns)}) before merging"
        )

    # Temporary view referenced by the MERGE statement
    deduped_df.createOrReplaceTempView("connection_updates")

    # If the table is empty, just insert all records.
    # head(1) avoids counting the whole table just to test for emptiness.
    if not spark.table(table_name).head(1):
        logger.info(f"Table {table_name} is empty. Inserting all records.")
        deduped_df.write.mode("append").saveAsTable(table_name)
        return

    match_condition = " AND ".join(f"target.{key} = source.{key}" for key in key_columns)

    merge_query = f"""
    MERGE INTO {table_name} AS target
    USING connection_updates AS source
    ON {match_condition}
    WHEN MATCHED THEN
        UPDATE SET
            target.connection_id = source.connection_id,
            target.display_name = source.display_name,
            target.connectivity_type = source.connectivity_type,
            target.gateway_id = source.gateway_id,
            target.connection_details_type = source.connection_details_type,
            target.connection_details_path = source.connection_details_path,
            target.extraction_timestamp = source.extraction_timestamp
    WHEN NOT MATCHED THEN
        INSERT *
    """

    spark.sql(merge_query)
    logger.info("Merge operation completed successfully")


def optimize_delta_table(table_name: str):
    """
    Refresh the statistics of a Delta table.

    Runs ANALYZE TABLE ... COMPUTE STATISTICS on the table. This step is
    best-effort: a failure is logged as a warning and never propagated.

    Args:
        table_name: Name of the Delta table to optimize
    """
    logger.info(f"Optimizing Delta table '{table_name}'")

    try:
        # Statistics feed the query planner
        spark.sql(f"ANALYZE TABLE {table_name} COMPUTE STATISTICS")
    except Exception as exc:
        logger.warning(f"Table optimization step encountered an issue: {str(exc)}")
        logger.info("Continuing with process - optimization is not critical for functionality")
    else:
        logger.info("Table statistics updated successfully")
        # No further optimization commands are issued here; see the log note below.
        logger.info("Delta table optimization completed via statistics computation")
        logger.info("Note: Microsoft Fabric may automatically optimize Delta tables")
# ==================================


# CELL 12 - Main Execution Function
# ==================================
def _empty_connections_dataframe():
    """Return an empty DataFrame that carries the connections table schema."""
    empty_schema = StructType([
        StructField("workspace_id", StringType(), False),
        StructField("item_id", StringType(), False),
        StructField("connection_id", StringType(), True),
        StructField("display_name", StringType(), True),
        StructField("connectivity_type", StringType(), False),
        StructField("gateway_id", StringType(), True),
        StructField("connection_details_type", StringType(), False),
        StructField("connection_details_path", StringType(), False),
        StructField("connection_hash", StringType(), False),
        StructField("extraction_timestamp", TimestampType(), False)
    ])
    return spark.createDataFrame([], empty_schema)


def _show_distribution(table_name: str, column: str, label: str) -> None:
    """Display row counts of ``table_name`` grouped by ``column``, largest first."""
    distribution = spark.sql(f"""
        SELECT
            {column},
            COUNT(*) as count
        FROM {table_name}
        GROUP BY {column}
        ORDER BY count DESC
    """)

    logger.info(f"Connection distribution by {label}:")
    distribution.show(truncate=False)


def _report_table_statistics(table_name: str) -> None:
    """Display table detail, row count, summary counts and distributions for ``table_name``."""
    # Show table information
    spark.sql(f"DESCRIBE DETAIL {table_name}").show(truncate=False)

    # Show row count
    row_count = spark.table(table_name).count()
    logger.info(f"Total rows in {table_name}: {row_count}")

    # Show summary statistics
    summary_stats = spark.sql(f"""
        SELECT
            COUNT(*) as total_connections,
            COUNT(DISTINCT workspace_id) as unique_workspaces,
            COUNT(DISTINCT item_id) as unique_items,
            COUNT(DISTINCT connectivity_type) as connectivity_types,
            COUNT(DISTINCT connection_details_type) as connection_detail_types,
            MAX(extraction_timestamp) as last_updated
        FROM {table_name}
    """)

    logger.info("Summary statistics:")
    summary_stats.show(truncate=False)

    # Same grouped-count report for each categorical column of interest.
    _show_distribution(table_name, "connectivity_type", "connectivity type")
    _show_distribution(table_name, "connection_details_type", "connection details type")


def main():
    """
    Main execution function that orchestrates the entire process.

    This function:
    1. Gets the authentication token via get_access_token()
    2. Gets the workspace/item pairs via get_workspace_items()
    3. Fetches the item connections via get_all_item_connections()
    4. Builds the connections DataFrame (an empty one with the table schema
       when no connections came back)
    5. Ensures the Delta table named by CONFIG["CONNECTIONS_TABLE_NAME"] exists
    6. Merges the data into that table and optimizes it (only when
       connections were returned)
    7. Displays table detail, row count, summary and distribution statistics

    Returns:
        The connections DataFrame that was built, or None when
        get_workspace_items() returns nothing to process.

    Raises:
        Exception: Any error raised by a step is logged and re-raised unchanged.
    """
    try:
        logger.info("Starting Fabric Item Connections to Delta Lake process")

        # Step 1: Get authentication token
        logger.info("Getting access token...")
        access_token = get_access_token()
        logger.info("Successfully obtained access token")

        # Step 2: Get workspace and item IDs
        logger.info("Retrieving workspace and item IDs...")
        workspace_items = get_workspace_items()

        if not workspace_items:
            # Nothing to query: stop before touching the API or the table.
            logger.warning("No workspace-item pairs found. Please check the query and source tables.")
            return None

        # Step 3: Retrieve all item connections
        logger.info("Retrieving item connections from Fabric API...")
        connections_data = get_all_item_connections(workspace_items, access_token)

        if not connections_data:
            logger.warning("No connections found for any items. Creating empty DataFrame.")
            # Empty frame with the full schema keeps the table structure consistent
            connections_df = _empty_connections_dataframe()
        else:
            # Step 4: Create enhanced DataFrame
            logger.info(f"Creating DataFrame for {len(connections_data)} connections...")
            connections_df = create_enhanced_connections_dataframe(connections_data)

        # Show sample data
        logger.info("Sample of enhanced connections data:")
        connections_df.show(5, truncate=False)

        # Step 5: Prepare Delta table
        table_name = CONFIG["CONNECTIONS_TABLE_NAME"]
        ensure_delta_table_exists(table_name, connections_df.schema)

        # Step 6: Merge data into Delta table (if we have data)
        if connections_data:
            merge_data_to_delta(connections_df, table_name)

            # Step 7: Optimize the Delta table
            optimize_delta_table(table_name)

        # Step 8: Display final statistics
        logger.info("Loading completed successfully!")
        _report_table_statistics(table_name)

        return connections_df

    except Exception as e:
        # Log for the notebook output, then let the failure propagate to the caller.
        logger.error(f"Error in main execution: {str(e)}")
        raise
# ==================================


# CELL 13 - Execute Main Function
# ==================================
# Run the pipeline when this code executes as the top-level script
# (__name__ == "__main__") and keep main()'s result in connections_df:
# the DataFrame main() returns, or None when main() finds no
# workspace-item pairs to process.
if __name__ == "__main__":
    connections_df = main()
# ==================================

StatementMeta(, d586d30c-5929-4ada-8fa4-df0be3edb891, 6, Finished, Available, Finished)

2025-07-16 18:19:38,089 - INFO - Starting Fabric Item Connections to Delta Lake process
2025-07-16 18:19:38,090 - INFO - Getting access token...
2025-07-16 18:19:38,095 - INFO - Successfully obtained access token
2025-07-16 18:19:38,095 - INFO - Retrieving workspace and item IDs...
2025-07-16 18:19:38,096 - INFO - Executing query to get workspace and item IDs...
2025-07-16 18:19:38,096 - INFO - Query: 
SELECT 
    fw.id AS WorkspaceID, 
    FI.id AS ItemID 
FROM 
    FabricAdmin_Lakehouse.dbo.fabric_capacities AS fc
JOIN 
    FabricAdmin_Lakehouse.dbo.fabric_workspaces AS fw
    ON UPPER(fc.id) = fw.capacityId  -- Join on capacityId
JOIN 
    FabricAdmin_Lakehouse.dbo.fabric_items AS FI
    ON fw.id = FI.workspace_id
WHERE 
    fc.displayName = 'MDA Institutional Capacity - PROD'  -- Filter for specific displayName
    AND fw.state = 'Active'  -- Filter for active workspaces
    AND fw.type <> 'Personal'  -- Exclude personal workspaces
    AND FI.type NOT IN ('Report','Dashboard','Note

+------------------------------------+------------------------------------+
|WorkspaceID                         |ItemID                              |
+------------------------------------+------------------------------------+
|3dc16785-8298-4567-880a-9919f074555f|fe1918d6-0a63-4b30-9208-afc8e5c1d5df|
|6cf9f4f0-0f1a-4514-82d6-ac6eb63f40ba|f08cffd2-bd9f-4c46-b4bd-1c58f9153fde|
|6cf9f4f0-0f1a-4514-82d6-ac6eb63f40ba|69653dc2-dcee-4688-b832-b17a2438dc9a|
|6cf9f4f0-0f1a-4514-82d6-ac6eb63f40ba|b335dadd-91e1-4bd5-9286-fc1b11eef750|
|6cf9f4f0-0f1a-4514-82d6-ac6eb63f40ba|8beeb061-c1e7-4717-aaf8-f7f0de781d80|
|6cf9f4f0-0f1a-4514-82d6-ac6eb63f40ba|0debddf5-8b75-408d-91a7-a034f529252f|
|6cf9f4f0-0f1a-4514-82d6-ac6eb63f40ba|abc87d9b-7aae-4e9a-9144-03c55b775059|
|6cf9f4f0-0f1a-4514-82d6-ac6eb63f40ba|d695a243-9542-4498-95a3-54f1f8a2176b|
|6cf9f4f0-0f1a-4514-82d6-ac6eb63f40ba|5cde3586-177e-4d76-aeef-7983cb732f69|
|6cf9f4f0-0f1a-4514-82d6-ac6eb63f40ba|3d9ff0bd-aa57-4e9c-9a2e-4e81ba4b80d1|
|6cf9f4f0-0f

In [5]:
from pyspark.sql import SparkSession

# Obtain a Spark session; getOrCreate() hands back the already-running
# session when there is one instead of starting a new one.
spark = (
    SparkSession.builder
    .appName("Refresh SQL Endpoint Metadata")
    .getOrCreate()
)

# REFRESH TABLE invalidates Spark's cached data and metadata for the table.
# NOTE(review): the app name and the message below suggest the goal is the
# SQL endpoint's metadata; whether this statement affects it is not shown
# here - confirm.
spark.sql("REFRESH TABLE fabric_items_connections")
print("Metadata refresh triggered successfully.")


StatementMeta(, d586d30c-5929-4ada-8fa4-df0be3edb891, 7, Finished, Available, Finished)

Metadata refresh triggered successfully.
