In [1]:
# DO NOT DELETE THIS CELL

# API Name: Capacities - List Capacities
# Command:  GET https://api.fabric.microsoft.com/v1/capacities
# Doc:      https://learn.microsoft.com/en-us/rest/api/fabric/core/capacities/list-capacities

# Loads table: fabric_capacities

StatementMeta(, aa104a35-c8a8-4a1c-916b-bc3ca5880acb, 3, Finished, Available, Finished)

In [2]:
# CELL 1 - Title and Introduction
# ==================================
# Microsoft Fabric Capacities to Delta Lake - PySpark Notebook
# This notebook retrieves Microsoft Fabric capacities and loads them into a Delta Lake table
# with optimization for analytics workloads
# ==================================


# CELL 2 - Import Libraries
# ==================================
# Import required libraries
import requests
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import logging
from typing import Dict, List, Optional
from delta.tables import DeltaTable
# ==================================


# CELL 3 - Configure Logging and Initialize Spark
# ==================================
# Logging setup: INFO-level messages formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Build (or reuse) the Spark session, registering the Delta SQL extension and
# the Delta catalog implementation on the builder.
spark = (
    SparkSession.builder
    .appName("FabricCapacitiesToDelta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Session-level write tuning for Delta: optimized writes and automatic compaction.
# NOTE(review): these keys use the "spark.databricks." prefix - confirm the
# target runtime honours them rather than silently ignoring them.
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")
# ==================================


# CELL 4 - Configuration Parameters
# ==================================
# Configuration Parameters
# These are the settings we'll use throughout the notebook
CONFIG = dict(
    API_BASE_URL="https://api.fabric.microsoft.com/v1",  # Prefix for every endpoint path
    MAX_RETRIES=3,  # Attempts per API call
    PAGE_SIZE=100,  # Number of items per page for API calls
    TIMEOUT=30,  # API request timeout in seconds
    DELTA_TABLE_NAME="fabric_capacities",  # Name of the Delta table
    LAKEHOUSE_PATH="Tables",  # Default Tables folder in Fabric Lakehouse
)
# ==================================


# CELL 5 - Authentication Function
# ==================================
def get_access_token():
    """
    Fetch a token for the Fabric REST API from the notebook runtime.

    The token is requested through ``mssparkutils.credentials.getToken`` for the
    ``https://api.fabric.microsoft.com`` audience. ``notebookutils`` is imported
    inside the function, so an import failure is handled the same way as a
    failed token request: it is logged and re-raised.

    Returns:
        Whatever ``mssparkutils.credentials.getToken`` returns (presumably the
        raw token string - confirm against the notebookutils documentation).

    Raises:
        Exception: Any error raised by the import or the token request, after
            it has been logged.
    """
    fabric_audience = "https://api.fabric.microsoft.com"
    try:
        # notebookutils is imported lazily, at call time rather than at module load.
        from notebookutils import mssparkutils
        return mssparkutils.credentials.getToken(fabric_audience)
    except Exception as token_error:
        logger.error(f"Failed to get access token: {str(token_error)}")
        raise
# ==================================


# CELL 6 - API Call Function
# ==================================
def call_fabric_api(endpoint: str, access_token: str, params: Optional[Dict] = None) -> Dict:
    """
    Issue an authenticated GET request against the Fabric REST API.

    The request is attempted up to ``CONFIG['MAX_RETRIES']`` times (at least
    once). Only failures that can plausibly succeed on a later attempt are
    retried: errors without an HTTP response (connection problems, timeouts,
    undecodable bodies), HTTP 408, HTTP 429 and 5xx responses. Any other 4xx
    response (for example 401, 403 or 404) is raised immediately, because
    repeating the same request cannot fix it.

    Between attempts the function sleeps for ``2 ** attempt`` seconds, or for
    the number of seconds in the response's ``Retry-After`` header when that
    is larger.

    Args:
        endpoint: The API endpoint path (e.g., "/capacities"), appended to
            ``CONFIG['API_BASE_URL']``.
        access_token: Bearer token sent in the ``Authorization`` header.
        params: Optional query parameters for the API call.

    Returns:
        dict: The decoded JSON body of the successful response.

    Raises:
        requests.exceptions.RequestException: If the call fails with a
            non-retryable status, or still fails on the final attempt.
    """
    # Function-scope import, as in the original, but hoisted out of the retry loop.
    import time

    url = f"{CONFIG['API_BASE_URL']}{endpoint}"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }

    # Always make at least one attempt so a zero/negative MAX_RETRIES cannot
    # make the function fall through and return None.
    max_attempts = max(1, CONFIG['MAX_RETRIES'])

    for attempt in range(max_attempts):
        try:
            logger.info(f"Making API call to: {url} (Attempt {attempt + 1})")

            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=CONFIG['TIMEOUT']
            )

            # Turn 4xx/5xx responses into HTTPError so they reach the handler below.
            response.raise_for_status()

            return response.json()

        except requests.exceptions.RequestException as e:
            logger.warning(f"API call failed (Attempt {attempt + 1}): {str(e)}")

            failed_response = getattr(e, "response", None)
            status = failed_response.status_code if failed_response is not None else None

            # Client errors other than timeout/throttling will not change on retry.
            if status is not None and 400 <= status < 500 and status not in (408, 429):
                logger.error(f"Non-retryable HTTP {status} for endpoint: {endpoint}")
                raise

            if attempt == max_attempts - 1:
                logger.error(f"All retry attempts failed for endpoint: {endpoint}")
                raise

            # Exponential backoff, extended when the server asks for a longer pause.
            wait_seconds = 2 ** attempt
            if failed_response is not None:
                retry_after = str(failed_response.headers.get("Retry-After", "")).strip()
                if retry_after.isdigit():
                    wait_seconds = max(wait_seconds, int(retry_after))

            time.sleep(wait_seconds)
# ==================================


# CELL 7 - Get All Capacities Function
# ==================================
def get_all_capacities(access_token: str) -> List[Dict]:
    """
    Collect every capacity returned by the "/capacities" endpoint.

    Pages are requested one after another through ``call_fabric_api``. Each
    request carries ``top`` (from ``CONFIG['PAGE_SIZE']``) and, after the first
    page, the ``continuationToken`` taken from the previous response. Paging
    stops as soon as a response has no (or an empty) ``continuationToken``.

    Args:
        access_token: Token forwarded to ``call_fabric_api``.

    Returns:
        list: The concatenated ``value`` entries of all pages, in the order
        they were received.
    """
    collected = []
    next_token = None
    more_pages = True

    while more_pages:
        # Build the query for this page; the token is only sent once we have one.
        query = {"top": CONFIG['PAGE_SIZE']}
        if next_token:
            query["continuationToken"] = next_token

        page = call_fabric_api("/capacities", access_token, query)

        # A page without a "value" key contributes nothing.
        page_items = page.get("value", [])
        collected.extend(page_items)

        logger.info(f"Retrieved {len(page_items)} capacities. Total so far: {len(collected)}")

        next_token = page.get("continuationToken")
        more_pages = bool(next_token)

    logger.info(f"Finished retrieving capacities. Total count: {len(collected)}")
    return collected
# ==================================


# CELL 8 - Create Enhanced DataFrame Function
# ==================================
def create_enhanced_capacities_dataframe(capacities_data: List[Dict]) -> "DataFrame":
    """
    Convert the capacities data into a PySpark DataFrame for Delta Lake.

    The DataFrame is built with an explicit all-string schema instead of
    relying on type inference. The schema is therefore the same on every run,
    and the function also works when the input is empty or when a field is
    missing/null for every capacity (cases in which inference cannot determine
    a column type).

    Args:
        capacities_data: List of capacity dictionaries from the API. Only the
            keys "id", "displayName", "sku", "region" and "state" are read;
            missing keys become nulls.

    Returns:
        DataFrame: Columns id, displayName, sku, region, state (all strings)
        plus ``extraction_timestamp`` set via ``current_timestamp()``.
    """
    source_fields = ["id", "displayName", "sku", "region", "state"]

    # All source fields are declared nullable so that a capacity with a missing
    # field is loaded as a null instead of failing schema verification.
    base_schema = StructType([
        StructField(field_name, StringType(), True) for field_name in source_fields
    ])

    # One tuple per capacity, in schema order. Values are coerced to str so a
    # non-string payload value cannot break verification against StringType.
    rows = [
        tuple(
            None if capacity.get(field_name) is None else str(capacity.get(field_name))
            for field_name in source_fields
        )
        for capacity in capacities_data
    ]

    spark_df = spark.createDataFrame(rows, schema=base_schema)

    # Add metadata column recording when this extraction ran
    enhanced_df = spark_df.withColumn("extraction_timestamp", current_timestamp())

    return enhanced_df
# ==================================


# CELL 9 - Delta Lake Operations Functions
# ==================================
def ensure_delta_table_exists(table_name: str, df_schema):
    """
    Ensure the Delta table exists, creating an empty one if necessary.

    Existence is checked with ``spark.catalog.tableExists``. Errors raised by
    that check (permissions, connectivity, ...) propagate to the caller instead
    of being interpreted as "table missing", so an existing table is never
    replaced by an empty one because of an unrelated failure.

    Args:
        table_name: Name of the Delta table.
        df_schema: Schema used for the empty table when it has to be created.
    """
    if spark.catalog.tableExists(table_name):
        logger.info(f"Delta table '{table_name}' already exists")
        return

    logger.info(f"Creating Delta table '{table_name}'")

    # Create an empty DataFrame with the schema
    empty_df = spark.createDataFrame([], df_schema)

    # Create the table (without partitioning). The format is set explicitly
    # so the result is a Delta table regardless of the session's default source.
    empty_df.write \
        .format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable(table_name)

    logger.info(f"Delta table '{table_name}' created successfully")


def merge_data_to_delta(source_df, table_name: str):
    """
    Upsert ``source_df`` into ``table_name`` with a SQL MERGE keyed on ``id``.

    The source DataFrame is registered as the temporary view
    ``capacity_updates`` (replacing any existing view of that name). Rows whose
    ``id`` matches a target row overwrite its displayName, sku, region, state
    and extraction_timestamp; all other rows are inserted.

    Args:
        source_df: DataFrame holding the new data.
        table_name: Name of the target table, interpolated into the MERGE
            statement as-is.
    """
    logger.info(f"Starting merge operation for {table_name}")

    staging_view = "capacity_updates"

    # Assemble the statement first; nothing touches Spark until the view is
    # registered and the query is submitted below.
    upsert_sql = f"""
    MERGE INTO {table_name} AS target
    USING {staging_view} AS source
    ON target.id = source.id
    WHEN MATCHED THEN
        UPDATE SET 
            target.displayName = source.displayName,
            target.sku = source.sku,
            target.region = source.region,
            target.state = source.state,
            target.extraction_timestamp = source.extraction_timestamp
    WHEN NOT MATCHED THEN
        INSERT *
    """

    # Expose the incoming rows to SQL under the name the MERGE refers to.
    source_df.createOrReplaceTempView(staging_view)
    spark.sql(upsert_sql)

    logger.info("Merge operation completed successfully")


def optimize_delta_table(table_name: str):
    """
    Run the post-load maintenance statements on ``table_name``.

    Two SQL statements are executed in order:
    1. ``OPTIMIZE ... ZORDER BY (region, sku, state)``
    2. ``ANALYZE TABLE ... COMPUTE STATISTICS``

    Args:
        table_name: Name of the table to optimize, interpolated into both
            statements as-is.
    """
    logger.info(f"Optimizing Delta table '{table_name}'")

    # File compaction with Z-ordering on the columns chosen for filtering.
    compaction_sql = f"""
        OPTIMIZE {table_name}
        ZORDER BY (region, sku, state)
    """
    # Statistics refresh for the query planner.
    statistics_sql = f"ANALYZE TABLE {table_name} COMPUTE STATISTICS"

    for statement in (compaction_sql, statistics_sql):
        spark.sql(statement)

    logger.info("Delta table optimization completed")
# ==================================


# CELL 10 - Main Execution Function
# ==================================
def main():
    """
    Run the end-to-end load: token -> API -> DataFrame -> Delta table -> report.

    Steps, in order:
    1. Obtain an access token.
    2. Fetch all capacities; stop early (returning None) when there are none.
    3. Build the enhanced DataFrame and show the first 5 rows.
    4. Make sure the target table exists, merge the data into it and optimize it.
    5. Show DESCRIBE DETAIL, log the row count and show summary statistics.

    Returns:
        The enhanced DataFrame that was merged, or None when the API returned
        no capacities.

    Raises:
        Exception: Any failure in the steps above, after it has been logged.
    """
    try:
        logger.info("Starting Fabric Capacities to Delta Lake process")

        # --- Authentication ---
        logger.info("Getting access token...")
        token = get_access_token()
        logger.info("Successfully obtained access token")

        # --- Extraction ---
        logger.info("Retrieving capacities from Fabric API...")
        raw_capacities = get_all_capacities(token)
        if not raw_capacities:
            logger.warning("No capacities found")
            return

        # --- Transformation ---
        logger.info("Creating enhanced PySpark DataFrame...")
        enhanced_df = create_enhanced_capacities_dataframe(raw_capacities)

        logger.info("Sample of enhanced data:")
        enhanced_df.show(5, truncate=False)

        # --- Load: create if needed, upsert, then optimize ---
        table_name = CONFIG["DELTA_TABLE_NAME"]
        ensure_delta_table_exists(table_name, enhanced_df.schema)
        merge_data_to_delta(enhanced_df, table_name)
        optimize_delta_table(table_name)

        # --- Reporting ---
        logger.info("Loading completed successfully!")

        spark.sql(f"DESCRIBE DETAIL {table_name}").show(truncate=False)

        logger.info(f"Total rows in {table_name}: {spark.table(table_name).count()}")

        summary = spark.sql(f"""
            SELECT 
                COUNT(DISTINCT id) as unique_capacities,
                COUNT(DISTINCT region) as regions,
                COUNT(DISTINCT sku) as skus,
                MAX(extraction_timestamp) as last_updated
            FROM {table_name}
        """)

        logger.info("Summary statistics:")
        summary.show(truncate=False)

        return enhanced_df

    except Exception as failure:
        logger.error(f"Error in main execution: {str(failure)}")
        raise
# ==================================


# CELL 11 - Execute Main Function
# ==================================
# Execute the main function
# Run the pipeline only when this code executes as the top-level module, and
# keep main()'s result in `capacities_df` for later cells. The value is the
# merged DataFrame, or None when no capacities were found.
if __name__ == "__main__":
    capacities_df = main()
# ==================================


# CELL 12 - Maintenance and Best Practices
# ==================================
"""
MAINTENANCE AND BEST PRACTICES:

1. SCHEDULED UPDATES:
   - Schedule this notebook to run periodically (e.g., daily/hourly)
   - Use Fabric pipelines or scheduling features
   - Consider incremental updates for large datasets

2. DELTA LAKE MAINTENANCE:
   - Run VACUUM periodically to clean old files:
     spark.sql(f"VACUUM {table_name} RETAIN 168 HOURS")
   - Monitor file sizes and compaction needs
   - Review Z-ORDER columns based on query patterns

3. MONITORING:
   - Set up alerts for data quality issues
   - Monitor table growth and storage usage
   - Track query performance in Power BI

4. POWER BI INTEGRATION:
   - Use DirectQuery mode for real-time data
   - Create relationships in Power BI data model
   - Optimize DAX measures for performance

5. ACCESS CONTROL:
   - Set appropriate permissions on the Delta table
   - Use row-level security if needed
   - Audit access to sensitive capacity data

6. PERFORMANCE TIPS:
   - Use Z-ordering for frequently filtered columns
   - Leverage columnar format (Parquet) in Delta
   - Create aggregated tables for complex calculations

Example maintenance script:
```python
# Run weekly maintenance
def weekly_maintenance():
    # Optimize table
    spark.sql(f"OPTIMIZE {CONFIG['DELTA_TABLE_NAME']}")
    
    # Clean up old files
    spark.sql(f"VACUUM {CONFIG['DELTA_TABLE_NAME']} RETAIN 168 HOURS")
    
    # Update statistics
    spark.sql(f"ANALYZE TABLE {CONFIG['DELTA_TABLE_NAME']} COMPUTE STATISTICS")
```

7. ERROR RECOVERY:
   - Implement checkpointing for long processes
   - Use Delta time travel for recovery:
     spark.read.option("versionAsOf", 1).table(table_name)
   - Keep logs for troubleshooting
"""
# ==================================

StatementMeta(, aa104a35-c8a8-4a1c-916b-bc3ca5880acb, 4, Finished, Available, Finished)

2025-07-16 13:18:32,713 - INFO - Starting Fabric Capacities to Delta Lake process
2025-07-16 13:18:32,714 - INFO - Getting access token...
2025-07-16 13:18:33,652 - INFO - Successfully obtained access token
2025-07-16 13:18:33,653 - INFO - Retrieving capacities from Fabric API...
2025-07-16 13:18:33,653 - INFO - Making API call to: https://api.fabric.microsoft.com/v1/capacities (Attempt 1)
2025-07-16 13:18:39,451 - INFO - Retrieved 9 capacities. Total so far: 9
2025-07-16 13:18:39,451 - INFO - Finished retrieving capacities. Total count: 9
2025-07-16 13:18:39,452 - INFO - Creating enhanced PySpark DataFrame...
2025-07-16 13:18:51,511 - INFO - Delta table 'fabric_capacities' already exists
2025-07-16 13:18:51,511 - INFO - Starting merge operation for fabric_capacities
2025-07-16 13:19:05,186 - INFO - Merge operation completed successfully
2025-07-16 13:19:05,187 - INFO - Optimizing Delta table 'fabric_capacities'
2025-07-16 13:19:14,777 - INFO - Total rows in fabric_capacities: 9


+------------------------------------+---------------------------------+---+----------------+--------+--------------------------+
|id                                  |displayName                      |sku|region          |state   |extraction_timestamp      |
+------------------------------------+---------------------------------+---+----------------+--------+--------------------------+
|c73a5223-9ef6-4514-83cc-3e70297ee377|MDA Institutional Capacity - PROD|P1 |West US         |Active  |2025-07-16 13:18:40.648411|
|250aef2d-b24b-43a8-8564-8fefc5152522|f8nonprodsouthcentral001         |F8 |South Central US|Inactive|2025-07-16 13:18:40.648411|
|ab3b62c5-cff1-4341-a584-4ef86a529e8a|f64nonprodsouthcentral001        |F64|South Central US|Inactive|2025-07-16 13:18:40.648411|
|d94bc350-4bb9-4f24-9d89-fd633996eb28|f64x002                          |F64|South Central US|Active  |2025-07-16 13:18:40.648411|
|56125c55-2f69-4fa3-bac0-e9407fc17374|f64x001                          |F64|South Central 

'\nMAINTENANCE AND BEST PRACTICES:\n\n1. SCHEDULED UPDATES:\n   - Schedule this notebook to run periodically (e.g., daily/hourly)\n   - Use Fabric pipelines or scheduling features\n   - Consider incremental updates for large datasets\n\n2. DELTA LAKE MAINTENANCE:\n   - Run VACUUM periodically to clean old files:\n     spark.sql(f"VACUUM {table_name} RETAIN 168 HOURS")\n   - Monitor file sizes and compaction needs\n   - Review Z-ORDER columns based on query patterns\n\n3. MONITORING:\n   - Set up alerts for data quality issues\n   - Monitor table growth and storage usage\n   - Track query performance in Power BI\n\n4. POWER BI INTEGRATION:\n   - Use DirectQuery mode for real-time data\n   - Create relationships in Power BI data model\n   - Optimize DAX measures for performance\n\n5. ACCESS CONTROL:\n   - Set appropriate permissions on the Delta table\n   - Use row-level security if needed\n   - Audit access to sensitive capacity data\n\n6. PERFORMANCE TIPS:\n   - Use Z-ordering for freq

In [3]:
from pyspark.sql import SparkSession

# Obtain a Spark session through getOrCreate() and rebind the notebook-level
# `spark` name to it.
# NOTE(review): when a session already exists (as after the cell above),
# getOrCreate() presumably returns that session, so the app name given here
# may have no effect - confirm.
spark = SparkSession.builder.appName("Refresh SQL Endpoint Metadata").getOrCreate()

# Issue Spark SQL REFRESH TABLE for the table loaded by the previous cell.
# The table name is hard-coded here rather than read from CONFIG.
# NOTE(review): the app name and the message below suggest the goal is to
# refresh SQL endpoint metadata - verify that REFRESH TABLE actually does that.
spark.sql("REFRESH TABLE fabric_capacities")
print("Metadata refresh triggered successfully.")


StatementMeta(, aa104a35-c8a8-4a1c-916b-bc3ca5880acb, 5, Finished, Available, Finished)

Metadata refresh triggered successfully.
