In [1]:
# DO NOT DELETE THIS CELL

# API Name: Gateways - List Gateways
# Command:  GET https://api.fabric.microsoft.com/v1/gateways
# Doc:      https://learn.microsoft.com/en-us/rest/api/fabric/core/gateways/list-gateways

# Loads table: fabric_onprem_gateways

StatementMeta(, 3e21ce52-3042-45fe-9271-a9741d2a5f91, 3, Finished, Available, Finished)

In [2]:
# CELL 1 - Title and Introduction
# ==================================
# Microsoft Fabric Gateways to Delta Lake - PySpark Notebook
# This notebook retrieves Microsoft Fabric on-premises gateways and loads them into a Delta Lake table
# with optimization for analytics workloads
# ==================================


# CELL 2 - Import Libraries
# ==================================
# Import required libraries
import requests
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit, from_json
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BooleanType, IntegerType
import logging
from typing import Dict, List, Optional
from delta.tables import DeltaTable
# ==================================


# CELL 3 - Configure Logging and Initialize Spark
# ==================================
# Configure logging
# This helps us track what's happening in our code and debug issues
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Initialize Spark Session with Delta Lake configurations
# In Fabric notebooks, Spark is pre-configured with Delta support
spark = SparkSession.builder \
    .appName("FabricGatewaysToDelta") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

# Set optimal configurations for Delta operations
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")
# ==================================


# CELL 4 - Configuration Parameters
# ==================================
# Configuration Parameters
# These are the settings we'll use throughout the notebook
CONFIG = {
    "API_BASE_URL": "https://api.fabric.microsoft.com/v1",
    "MAX_RETRIES": 3,
    "PAGE_SIZE": 100,  # Number of items per page for API calls
    "TIMEOUT": 30,  # API request timeout in seconds
    "DELTA_TABLE_NAME": "fabric_onprem_gateways",  # Name of the Delta table
    "LAKEHOUSE_PATH": "Tables"  # Default Tables folder in Fabric Lakehouse
}
# ==================================


# CELL 5 - Authentication Function
# ==================================
def get_access_token():
    """
    Get Azure AD access token for Fabric API authentication.
    
    In a Fabric notebook, the token is automatically available through mssparkutils.
    This function retrieves the token that's needed to authenticate with the Fabric REST API.
    
    Returns:
        str: The access token
    
    Note:
        mssparkutils is a utility library provided by Microsoft Fabric
        that handles authentication automatically.
    """
    try:
        # In Fabric notebooks, we can get the token using mssparkutils
        from notebookutils import mssparkutils
        token_response = mssparkutils.credentials.getToken("https://api.fabric.microsoft.com")
        return token_response
    except Exception as e:
        logger.error(f"Failed to get access token: {str(e)}")
        raise
# ==================================


# CELL 6 - API Call Function
# ==================================
def call_fabric_api(endpoint: str, access_token: str, params: Optional[Dict] = None) -> Dict:
    """
    Make a REST API call to Microsoft Fabric.
    
    This function handles the HTTP request to the Fabric API, including:
    - Setting up authentication headers
    - Managing retries if the request fails
    - Error handling
    
    Args:
        endpoint: The API endpoint path (e.g., "/gateways")
        access_token: The Azure AD access token
        params: Optional query parameters for the API call
    
    Returns:
        dict: The JSON response from the API
    
    Raises:
        requests.exceptions.RequestException: If the API call fails after all retries
    """
    url = f"{CONFIG['API_BASE_URL']}{endpoint}"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }
    
    # Retry logic - sometimes API calls can fail temporarily
    for attempt in range(CONFIG['MAX_RETRIES']):
        try:
            logger.info(f"Making API call to: {url} (Attempt {attempt + 1})")
            
            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=CONFIG['TIMEOUT']
            )
            
            # Check if the request was successful
            response.raise_for_status()
            
            return response.json()
            
        except requests.exceptions.RequestException as e:
            logger.warning(f"API call failed (Attempt {attempt + 1}): {str(e)}")
            
            if attempt == CONFIG['MAX_RETRIES'] - 1:
                logger.error(f"All retry attempts failed for endpoint: {endpoint}")
                raise
            
            # Wait before retrying (exponential backoff)
            import time
            time.sleep(2 ** attempt)
# ==================================


# CELL 7 - Get All On-Premises Gateways Function
# ==================================
def get_all_onpremises_gateways(access_token: str) -> List[Dict]:
    """
    Retrieve all Microsoft Fabric on-premises gateways, handling pagination if necessary.
    
    The Fabric API may return results in pages if there are many gateways.
    This function handles the pagination automatically to get all on-premises gateways.
    
    Args:
        access_token: The Azure AD access token
    
    Returns:
        list: A list of all on-premises gateway objects
    """
    all_gateways = []
    continuation_token = None
    
    while True:
        # Set up parameters for the API call
        params = {"top": CONFIG['PAGE_SIZE']}
        if continuation_token:
            params["continuationToken"] = continuation_token
        
        # Call the API
        response = call_fabric_api("/gateways", access_token, params)
        
        # Extract gateways from the response
        gateways = response.get("value", [])
        
        # Filter for only OnPremises type gateways
        onpremises_gateways = [gateway for gateway in gateways if gateway.get("type") == "OnPremises"]
        all_gateways.extend(onpremises_gateways)
        
        logger.info(f"Retrieved {len(onpremises_gateways)} on-premises gateways out of {len(gateways)} total. Running total: {len(all_gateways)}")
        
        # Check if there are more pages
        continuation_token = response.get("continuationToken")
        if not continuation_token:
            break
    
    logger.info(f"Finished retrieving on-premises gateways. Total count: {len(all_gateways)}")
    return all_gateways
# ==================================


# CELL 8 - Create Enhanced DataFrame Function
# ==================================
def create_enhanced_gateways_dataframe(gateways_data: List[Dict]) -> "DataFrame":
    """
    Convert the gateways data into an enhanced PySpark DataFrame for Delta Lake.
    
    This function:
    - Creates a structured DataFrame with the gateway data
    - Extracts only the required fields as specified
    - Adds metadata columns for tracking
    - Prepares the data for optimal Delta Lake storage
    
    Args:
        gateways_data: List of gateway dictionaries from the API
    
    Returns:
        DataFrame: An enhanced PySpark DataFrame ready for Delta Lake
    """
    # Extract only the fields we need from each gateway
    simplified_gateways = []
    
    for gateway in gateways_data:
        simplified_gateway = {
            "id": gateway.get("id"),
            "displayName": gateway.get("displayName"),
            "type": gateway.get("type"),
            "version": gateway.get("version"),
            "allowCloudConnectionRefresh": gateway.get("allowCloudConnectionRefresh"),
            "allowCustomConnectors": gateway.get("allowCustomConnectors"),
            "numberOfMemberGateways": gateway.get("numberOfMemberGateways"),
            # Handle loadBalancingSetting as a string since it's an object type
            "loadBalancingSetting": json.dumps(gateway.get("loadBalancingSetting")) if gateway.get("loadBalancingSetting") else None
        }
        simplified_gateways.append(simplified_gateway)
    
    # Define the schema with the specific fields we need
    schema = StructType([
        StructField("id", StringType(), False),  # False = not nullable
        StructField("displayName", StringType(), True),
        StructField("type", StringType(), True),
        StructField("version", StringType(), True),
        StructField("allowCloudConnectionRefresh", BooleanType(), True),
        StructField("allowCustomConnectors", BooleanType(), True),
        StructField("numberOfMemberGateways", IntegerType(), True),
        StructField("loadBalancingSetting", StringType(), True),
        StructField("extraction_timestamp", TimestampType(), False)
    ])
    
    # Convert the data to a DataFrame
    if not simplified_gateways:
        logger.warning("No on-premises gateways found. Creating empty DataFrame.")
        # Create an empty DataFrame with the schema
        empty_rdd = spark.sparkContext.emptyRDD()
        enhanced_df = spark.createDataFrame(empty_rdd, schema)
        return enhanced_df
    
    # Convert to pandas DataFrame first
    pandas_df = pd.DataFrame(simplified_gateways)
    
    # Create the initial Spark DataFrame
    # We don't include extraction_timestamp here as we'll add it next
    required_columns = ["id", "displayName", "type", "version", 
                        "allowCloudConnectionRefresh", "allowCustomConnectors", 
                        "numberOfMemberGateways", "loadBalancingSetting"]
    
    # Ensure all columns exist in the pandas DataFrame
    for col_name in required_columns:
        if col_name not in pandas_df.columns:
            pandas_df[col_name] = None
    
    spark_df = spark.createDataFrame(pandas_df[required_columns])
    
    # Add metadata column for tracking when this data was extracted
    enhanced_df = spark_df.withColumn("extraction_timestamp", current_timestamp())
    
    return enhanced_df
# ==================================


# CELL 9 - Delta Lake Operations Functions
# ==================================
def ensure_delta_table_exists(table_name: str, df_schema):
    """
    Ensure the Delta table exists, creating it if necessary.
    
    Args:
        table_name: Name of the Delta table
        df_schema: Schema of the DataFrame
    """
    try:
        # Check if table exists
        spark.sql(f"DESCRIBE TABLE {table_name}")
        logger.info(f"Delta table '{table_name}' already exists")
    except Exception:
        # Table doesn't exist, create it
        logger.info(f"Creating Delta table '{table_name}'")
        
        # Create an empty DataFrame with the schema
        empty_df = spark.createDataFrame([], df_schema)
        
        # Create the Delta table (without partitioning)
        empty_df.write \
            .mode("overwrite") \
            .option("overwriteSchema", "true") \
            .saveAsTable(table_name)
        
        logger.info(f"Delta table '{table_name}' created successfully")


def merge_data_to_delta(source_df, table_name: str):
    """
    Merge new gateway data into the Delta table using MERGE operation.
    
    This function performs an upsert operation:
    - Updates existing records if gateway ID matches
    - Inserts new records if gateway ID doesn't exist
    
    Args:
        source_df: DataFrame with new data
        table_name: Name of the target Delta table
    """
    logger.info(f"Starting merge operation for {table_name}")
    
    # Create a temporary view for the merge operation
    source_df.createOrReplaceTempView("gateway_updates")
    
    # If the table is empty, just insert all records
    if spark.table(table_name).count() == 0:
        logger.info(f"Table {table_name} is empty. Inserting all records.")
        source_df.write.mode("append").saveAsTable(table_name)
        return
    
    # Perform the merge operation
    merge_query = f"""
    MERGE INTO {table_name} AS target
    USING gateway_updates AS source
    ON target.id = source.id
    WHEN MATCHED THEN
        UPDATE SET 
            target.displayName = source.displayName,
            target.type = source.type,
            target.version = source.version,
            target.allowCloudConnectionRefresh = source.allowCloudConnectionRefresh,
            target.allowCustomConnectors = source.allowCustomConnectors,
            target.numberOfMemberGateways = source.numberOfMemberGateways,
            target.loadBalancingSetting = source.loadBalancingSetting,
            target.extraction_timestamp = source.extraction_timestamp
    WHEN NOT MATCHED THEN
        INSERT *
    """
    
    spark.sql(merge_query)
    logger.info("Merge operation completed successfully")


def optimize_delta_table(table_name: str):
    """
    Optimize the Delta table for better query performance.
    
    This function:
    - Compacts small files (reduces file count)
    - Z-orders data by commonly queried columns
    - Updates table statistics
    
    Args:
        table_name: Name of the Delta table to optimize
    """
    logger.info(f"Optimizing Delta table '{table_name}'")
    
    # Compact files and Z-order by commonly queried columns
    spark.sql(f"""
        OPTIMIZE {table_name}
        ZORDER BY (type, allowCloudConnectionRefresh, allowCustomConnectors)
    """)
    
    # Update table statistics for better query planning
    spark.sql(f"ANALYZE TABLE {table_name} COMPUTE STATISTICS")
    
    logger.info("Delta table optimization completed")
# ==================================


# CELL 10 - Main Execution Function
# ==================================
def main():
    """
    Main execution function that orchestrates the entire process.
    
    This function:
    1. Gets the authentication token
    2. Retrieves all on-premises gateways from the API
    3. Creates an enhanced PySpark DataFrame
    4. Loads data into a Delta Lake table
    5. Optimizes the table for analytics
    """
    try:
        logger.info("Starting Fabric On-Premises Gateways to Delta Lake process")
        
        # Step 1: Get authentication token
        logger.info("Getting access token...")
        access_token = get_access_token()
        logger.info("Successfully obtained access token")
        
        # Step 2: Retrieve all on-premises gateways
        logger.info("Retrieving on-premises gateways from Fabric API...")
        gateways_data = get_all_onpremises_gateways(access_token)
        
        if not gateways_data:
            logger.warning("No on-premises gateways found")
            # Create empty dataframe with schema for consistent table structure
            empty_schema = StructType([
                StructField("id", StringType(), False),
                StructField("displayName", StringType(), True),
                StructField("type", StringType(), True),
                StructField("version", StringType(), True),
                StructField("allowCloudConnectionRefresh", BooleanType(), True),
                StructField("allowCustomConnectors", BooleanType(), True),
                StructField("numberOfMemberGateways", IntegerType(), True),
                StructField("loadBalancingSetting", StringType(), True),
                StructField("extraction_timestamp", TimestampType(), False)
            ])
            gateways_df = spark.createDataFrame([], empty_schema)
        else:
            # Step 3: Create enhanced PySpark DataFrame
            logger.info("Creating enhanced PySpark DataFrame...")
            gateways_df = create_enhanced_gateways_dataframe(gateways_data)
        
        # Show sample data
        logger.info("Sample of enhanced data:")
        gateways_df.show(5, truncate=False)
        
        # Step 4: Prepare Delta table
        table_name = CONFIG["DELTA_TABLE_NAME"]
        ensure_delta_table_exists(table_name, gateways_df.schema)
        
        # Step 5: Merge data into Delta table (if we have data)
        if gateways_data:
            merge_data_to_delta(gateways_df, table_name)
            
            # Step 6: Optimize the Delta table
            optimize_delta_table(table_name)
        
        # Step 7: Display final statistics
        logger.info("Loading completed successfully!")
        
        # Show table information
        spark.sql(f"DESCRIBE DETAIL {table_name}").show(truncate=False)
        
        # Show row count
        row_count = spark.table(table_name).count()
        logger.info(f"Total rows in {table_name}: {row_count}")
        
        # Show summary statistics
        summary_stats = spark.sql(f"""
            SELECT 
                COUNT(DISTINCT id) as unique_gateways,
                COUNT(DISTINCT version) as gateway_versions,
                SUM(numberOfMemberGateways) as total_member_gateways,
                MAX(extraction_timestamp) as last_updated
            FROM {table_name}
        """)
        
        logger.info("Summary statistics:")
        summary_stats.show(truncate=False)
        
        return gateways_df
        
    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")
        raise
# ==================================


# CELL 11 - Execute Main Function
# ==================================
# Execute the main function
if __name__ == "__main__":
    gateways_df = main()
# ==================================


# CELL 12 - Maintenance and Best Practices
# ==================================
"""
MAINTENANCE AND BEST PRACTICES:

1. SCHEDULED UPDATES:
   - Schedule this notebook to run periodically (e.g., daily/weekly)
   - Use Fabric pipelines or scheduling features
   - Consider monitoring gateway changes for compliance/security

2. DELTA LAKE MAINTENANCE:
   - Run VACUUM periodically to clean old files:
     spark.sql(f"VACUUM {table_name} RETAIN 168 HOURS")
   - Monitor history retention and storage usage
   - Review Z-ORDER columns based on query patterns

3. MONITORING AND ALERTING:
   - Set up alerts for gateway changes or versions
   - Monitor for gateways with unusual settings
   - Track gateway counts and distributions

4. POWER BI INTEGRATION:
   - Create dashboards showing gateway distributions
   - Monitor gateway versions for outdated installations
   - Visualize gateway member counts 

5. DATA SECURITY:
   - Implement appropriate access controls on the Delta table
   - Consider sensitive information in gateway metadata
   - Document security implications of gateway settings

6. PERFORMANCE OPTIMIZATION:
   - Consider adding a date partition if data grows significantly
   - Create aggregated views for common analytics
   - Use databricks caching for frequently accessed data

Example maintenance query - Find outdated gateways:
```sql
SELECT 
  displayName, 
  version, 
  numberOfMemberGateways,
  extraction_timestamp
FROM Fabric_OnPrem_Gateways
WHERE version < '3.0.0'  -- Replace with current recommended version
ORDER BY numberOfMemberGateways DESC
```

7. ERROR RECOVERY:
   - Use Delta time travel for recovery:
     spark.read.option("versionAsOf", 1).table(table_name)
   - Implement logging for all gateway changes
   - Create snapshots before major gateway updates
"""
# ==================================


StatementMeta(, 3e21ce52-3042-45fe-9271-a9741d2a5f91, 4, Finished, Available, Finished)

2025-07-16 16:01:13,939 - INFO - Starting Fabric On-Premises Gateways to Delta Lake process
2025-07-16 16:01:13,940 - INFO - Getting access token...
2025-07-16 16:01:14,644 - INFO - Successfully obtained access token
2025-07-16 16:01:14,644 - INFO - Retrieving on-premises gateways from Fabric API...
2025-07-16 16:01:14,645 - INFO - Making API call to: https://api.fabric.microsoft.com/v1/gateways (Attempt 1)
2025-07-16 16:01:16,196 - INFO - Retrieved 14 on-premises gateways out of 14 total. Running total: 14
2025-07-16 16:01:16,197 - INFO - Finished retrieving on-premises gateways. Total count: 14
2025-07-16 16:01:16,197 - INFO - Creating enhanced PySpark DataFrame...
2025-07-16 16:01:17,372 - INFO - Sample of enhanced data:
2025-07-16 16:01:27,442 - INFO - Delta table 'fabric_onprem_gateways' already exists
2025-07-16 16:01:27,442 - INFO - Starting merge operation for fabric_onprem_gateways
2025-07-16 16:01:42,698 - INFO - Merge operation completed successfully
2025-07-16 16:01:42,699 

+------------------------------------+-------------------------------+----------+----------------------+---------------------------+---------------------+----------------------+--------------------+--------------------------+
|id                                  |displayName                    |type      |version               |allowCloudConnectionRefresh|allowCustomConnectors|numberOfMemberGateways|loadBalancingSetting|extraction_timestamp      |
+------------------------------------+-------------------------------+----------+----------------------+---------------------------+---------------------+----------------------+--------------------+--------------------------+
|211a986c-1e4a-4029-a064-126e534f40bb|Academic Analytics & Technology|OnPremises|3000.6.204+g6523bf7046|false                      |false                |1                     |"Failover"          |2025-07-16 16:01:17.390736|
|7505cd2d-8438-4fac-9b32-d43a1182d32e|mdaEnterprise                  |OnPremises|3000.45.7      

'\nMAINTENANCE AND BEST PRACTICES:\n\n1. SCHEDULED UPDATES:\n   - Schedule this notebook to run periodically (e.g., daily/weekly)\n   - Use Fabric pipelines or scheduling features\n   - Consider monitoring gateway changes for compliance/security\n\n2. DELTA LAKE MAINTENANCE:\n   - Run VACUUM periodically to clean old files:\n     spark.sql(f"VACUUM {table_name} RETAIN 168 HOURS")\n   - Monitor history retention and storage usage\n   - Review Z-ORDER columns based on query patterns\n\n3. MONITORING AND ALERTING:\n   - Set up alerts for gateway changes or versions\n   - Monitor for gateways with unusual settings\n   - Track gateway counts and distributions\n\n4. POWER BI INTEGRATION:\n   - Create dashboards showing gateway distributions\n   - Monitor gateway versions for outdated installations\n   - Visualize gateway member counts \n\n5. DATA SECURITY:\n   - Implement appropriate access controls on the Delta table\n   - Consider sensitive information in gateway metadata\n   - Document se

In [3]:
from pyspark.sql import SparkSession

# create Spark session
spark = SparkSession.builder.appName("Refresh SQL Endpoint Metadata").getOrCreate()

# refresh the specific table
spark.sql("REFRESH TABLE fabric_onprem_gateways")
print("Metadata refresh triggered successfully.")


StatementMeta(, 3e21ce52-3042-45fe-9271-a9741d2a5f91, 5, Finished, Available, Finished)

Metadata refresh triggered successfully.
