In [1]:
# DO NOT DELETE THIS CELL

# API Name: Gateways - Get Gateways
# Command:  GET https://api.powerbi.com/v1.0/myorg/gateways
# Doc:      https://learn.microsoft.com/en-us/rest/api/power-bi/gateways/get-gateways

# Loads table: pbi_gateways

StatementMeta(, afe84ec1-147d-4374-909d-9e60157c61a7, 3, Finished, Available, Finished)

In [2]:
# CELL 1 - Title and Introduction
# ==================================
# Power BI Gateways to Delta Lake - PySpark Notebook
# This notebook retrieves Power BI gateways and loads them into a Delta Lake table
# with optimization for analytics workloads
# ==================================


# CELL 2 - Import Libraries
# ==================================
# Import required libraries
import requests
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit, from_json, when, isnotnull
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BooleanType
import logging
from typing import Dict, List, Optional
from delta.tables import DeltaTable
import random
import time
# ==================================


# CELL 3 - Configure Logging and Initialize Spark
# ==================================
# Configure logging
# This helps us track what's happening in our code and debug issues
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Initialize Spark Session with Delta Lake configurations
# In Fabric notebooks, Spark is pre-configured with Delta support
spark = SparkSession.builder \
    .appName("PowerBIGatewaysToDelta") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

# Set optimal configurations for Delta operations
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")
# ==================================


# CELL 4 - Configuration Parameters
# ==================================
# Configuration Parameters
# These are the settings we'll use throughout the notebook
CONFIG = {
    "API_BASE_URL": "https://api.powerbi.com/v1.0/myorg",
    "GATEWAYS_ENDPOINT": "/gateways",  # Endpoint for listing gateways
    "MAX_RETRIES": 5,  # Number of retries for handling rate limits
    "INITIAL_BACKOFF_SEC": 1,  # Initial backoff time in seconds
    "MAX_BACKOFF_SEC": 60,  # Maximum backoff time in seconds
    "BACKOFF_FACTOR": 2,  # Exponential backoff multiplier
    "JITTER_FACTOR": 0.1,  # Random jitter to add to backoff (as a fraction)
    "TIMEOUT": 30,  # API request timeout in seconds
    "GATEWAYS_TABLE_NAME": "pbi_gateways",  # Name of the target Delta table
    "LAKEHOUSE_PATH": "Tables",  # Default Tables folder in Fabric Lakehouse
    "DEBUG_MODE": True  # Set to True to enable extra debugging output
}

# Power BI API uses standard OData pagination, not continuation tokens like Fabric API
# ==================================


# CELL 5 - Authentication Function
# ==================================
def get_access_token():
    """
    Get Azure AD access token for Power BI API authentication.
    
    In a Fabric notebook, the token is automatically available through mssparkutils.
    This function retrieves the token that's needed to authenticate with the Power BI REST API.
    
    Returns:
        str: The access token
    
    Note:
        mssparkutils is a utility library provided by Microsoft Fabric
        that handles authentication automatically.
        For Power BI API, we need the Power BI service scope.
    """
    try:
        # In Fabric notebooks, we can get the token using mssparkutils
        from notebookutils import mssparkutils
        # Power BI API requires the Power BI service scope
        token_response = mssparkutils.credentials.getToken("https://analysis.windows.net/powerbi/api")
        return token_response
    except Exception as e:
        logger.error(f"Failed to get access token: {str(e)}")
        raise
# ==================================


# CELL 6 - API Call Function
# ==================================
def call_powerbi_api(endpoint: str, access_token: str, params: Optional[Dict] = None) -> Dict:
    """
    Make a REST API call to Power BI with advanced rate limit handling.
    
    This function handles the HTTP request to the Power BI API, including:
    - Setting up authentication headers
    - Managing retries with intelligent backoff for rate limiting (429 errors)
    - Implementing jitter to avoid synchronized retries
    - Detailed error handling and logging
    
    Args:
        endpoint: The API endpoint path (e.g., "/gateways")
        access_token: The Azure AD access token
        params: Optional query parameters for the API call
    
    Returns:
        dict: The JSON response from the API
    
    Raises:
        requests.exceptions.RequestException: If the API call fails after all retries
    """
    url = f"{CONFIG['API_BASE_URL']}{endpoint}"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }
    
    # Initialize backoff time
    backoff_time = CONFIG['INITIAL_BACKOFF_SEC']
    
    # Retry logic with intelligent backoff
    for attempt in range(CONFIG['MAX_RETRIES']):
        try:
            # Log the full URL with parameters for debugging
            logger.info(f"Making API call to: {url} with params: {params} (Attempt {attempt + 1})")
            
            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=CONFIG['TIMEOUT']
            )
            
            # Log the response status for debugging
            logger.info(f"Response status: {response.status_code}")
            
            # Rate limit handling (429 Too Many Requests)
            if response.status_code == 429:
                # Get retry-after header if available, otherwise use our backoff
                retry_after = response.headers.get('Retry-After')
                
                if retry_after and retry_after.isdigit():
                    # If server specified a wait time, use it
                    wait_time = int(retry_after)
                else:
                    # Calculate wait time with exponential backoff and jitter
                    jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
                    wait_time = backoff_time + jitter
                    # Update backoff for next attempt
                    backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])
                
                logger.warning(f"Rate limit exceeded (429). Waiting {wait_time:.2f} seconds before retry.")
                time.sleep(wait_time)
                continue  # Skip to next retry attempt without raising exception
            
            # Log the response for debugging in case of errors
            if response.status_code >= 400:
                logger.error(f"API error: Status {response.status_code}, Response: {response.text}")
                logger.error(f"Request URL: {response.request.url}")
                logger.error(f"Request headers: {response.request.headers}")
            
            # For all other status codes, use raise_for_status to handle
            response.raise_for_status()
            
            # If we get here, the request was successful
            # Reset backoff for next API call (not next attempt)
            backoff_time = CONFIG['INITIAL_BACKOFF_SEC']
            
            # Log a sample of the response content for debugging
            try:
                response_json = response.json()
                if "value" in response_json and isinstance(response_json["value"], list):
                    logger.info(f"Response contains {len(response_json['value'])} items in 'value' array")
                return response_json
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse response as JSON: {str(e)}")
                logger.error(f"Response content: {response.text[:1000]}")  # Log first 1000 chars of response
                raise
            
        except requests.exceptions.RequestException as e:
            last_attempt = attempt == CONFIG['MAX_RETRIES'] - 1
            
            # Special handling for non-429 errors
            if not (hasattr(e, 'response') and e.response is not None and e.response.status_code == 429):
                logger.warning(f"API call failed (Attempt {attempt + 1}): {str(e)}")
                
                if last_attempt:
                    logger.error(f"All retry attempts failed for endpoint: {endpoint}")
                    logger.error(f"Final error: {str(e)}")
                    raise
                
                # Calculate wait time with exponential backoff and jitter
                jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
                wait_time = backoff_time + jitter
                # Update backoff for next attempt
                backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])
                
                logger.info(f"Waiting {wait_time:.2f} seconds before retry.")
                time.sleep(wait_time)
# ==================================


# CELL 7 - Get Gateways Function
# ==================================
def get_gateways(access_token: str) -> List[Dict]:
    """
    Retrieve all gateways from the Power BI API.
    
    This function makes a request to the Get Gateways API endpoint.
    Unlike some APIs, Power BI gateways API typically returns all results in one call
    since gateway counts are usually manageable (not requiring pagination).
    
    Args:
        access_token: The Azure AD access token
    
    Returns:
        list: A list of all gateway objects
    """
    try:
        logger.info("Retrieving gateways from Power BI API...")
        
        # Make the API call to get gateways
        # Power BI API returns gateways that the user is an admin for
        response_data = call_powerbi_api(CONFIG['GATEWAYS_ENDPOINT'], access_token)
        
        # Log the response structure for debugging
        if CONFIG['DEBUG_MODE']:
            logger.info(f"Response keys: {list(response_data.keys())}")
        
        # Extract gateways from the response
        # Power BI API returns data in "value" array (OData format)
        gateways = response_data.get("value", [])
        
        if gateways:
            logger.info(f"Retrieved {len(gateways)} gateways")
            
            # Log first gateway for debugging (with sensitive data masked)
            if CONFIG['DEBUG_MODE'] and gateways:
                sample_gateway = gateways[0].copy()
                # Mask sensitive data in logs
                if 'publicKey' in sample_gateway:
                    if isinstance(sample_gateway['publicKey'], dict):
                        if 'modulus' in sample_gateway['publicKey']:
                            sample_gateway['publicKey']['modulus'] = sample_gateway['publicKey']['modulus'][:20] + "..."
                logger.info(f"Sample gateway structure: {json.dumps(sample_gateway, indent=2)}")
        else:
            logger.warning("No gateways found - this could mean:")
            logger.warning("1. No gateways are configured in your Power BI tenant")
            logger.warning("2. You don't have admin permissions on any gateways")
            logger.warning("3. All gateways are VNet gateways (not supported by this API)")
        
        return gateways
        
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to retrieve gateways: {str(e)}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error while retrieving gateways: {str(e)}")
        raise
# ==================================


# CELL 8 - Create Enhanced DataFrame Function
# ==================================
def create_enhanced_gateways_dataframe(gateways_data: List[Dict]) -> "DataFrame":
    """
    Convert the gateways data into an enhanced PySpark DataFrame for Delta Lake.
    
    This function:
    - Creates a structured DataFrame with the gateway data
    - Flattens the nested publicKey object into separate columns
    - Adds metadata columns for tracking
    - Derives useful fields like is_active status
    
    Args:
        gateways_data: List of gateway dictionaries from the API
    
    Returns:
        DataFrame: An enhanced PySpark DataFrame ready for Delta Lake
    """
    # Extract and flatten the gateway data with proper null handling
    flattened_gateways = []
    
    for gateway in gateways_data:
        # Extract the main gateway fields with explicit null handling
        flattened_gateway = {
            "id": gateway.get("id", ""),  # ID should never be null
            "name": gateway.get("name", ""),
            "type": gateway.get("type", "Unknown"),
            "gateway_status": gateway.get("gatewayStatus", "Unknown"),  # Provide default
            "gateway_annotation": gateway.get("gatewayAnnotation", "")
        }
        
        # Flatten the nested publicKey object
        public_key = gateway.get("publicKey", {})
        if isinstance(public_key, dict):
            flattened_gateway["public_key_exponent"] = public_key.get("exponent", "")
            flattened_gateway["public_key_modulus"] = public_key.get("modulus", "")
        else:
            flattened_gateway["public_key_exponent"] = ""
            flattened_gateway["public_key_modulus"] = ""
        
        flattened_gateways.append(flattened_gateway)
    
    # Define the schema for our Delta table
    schema = StructType([
        StructField("id", StringType(), False),                    # Primary key - not nullable
        StructField("name", StringType(), True),                  # Gateway name
        StructField("type", StringType(), True),                  # Gateway type
        StructField("gateway_status", StringType(), True),        # Current status
        StructField("gateway_annotation", StringType(), True),    # JSON metadata
        StructField("public_key_exponent", StringType(), True),   # Flattened public key
        StructField("public_key_modulus", StringType(), True),    # Flattened public key
        StructField("is_active", BooleanType(), False),           # Derived status field
        StructField("extraction_timestamp", TimestampType(), False) # ETL timestamp
    ])
    
    # Handle empty data case
    if not flattened_gateways:
        logger.warning("No gateways found. Creating empty DataFrame with proper schema.")
        empty_rdd = spark.sparkContext.emptyRDD()
        enhanced_df = spark.createDataFrame(empty_rdd, schema)
        return enhanced_df
    
    # Convert to pandas DataFrame first for easier handling
    pandas_df = pd.DataFrame(flattened_gateways)
    
    # Ensure all required columns exist with proper defaults
    required_columns = [
        "id", "name", "type", "gateway_status", "gateway_annotation",
        "public_key_exponent", "public_key_modulus"
    ]
    
    for col_name in required_columns:
        if col_name not in pandas_df.columns:
            pandas_df[col_name] = ""  # Use empty string instead of None
    
    # Fill any remaining NaN values with empty strings
    pandas_df = pandas_df.fillna("")
    
    # Validate that ID column is not empty
    if pandas_df['id'].isna().any() or (pandas_df['id'] == "").any():
        logger.error("Found gateways with missing or empty IDs")
        # Remove rows with empty IDs
        pandas_df = pandas_df[pandas_df['id'] != ""]
        pandas_df = pandas_df.dropna(subset=['id'])
        logger.warning(f"Removed gateways with missing IDs. Remaining: {len(pandas_df)}")
    
    # Create the initial Spark DataFrame with explicit type casting
    spark_df = spark.createDataFrame(pandas_df[required_columns])
    
    # Cast all columns to proper string types and handle nulls
    typed_df = spark_df \
        .withColumn("id", col("id").cast(StringType())) \
        .withColumn("name", col("name").cast(StringType())) \
        .withColumn("type", col("type").cast(StringType())) \
        .withColumn("gateway_status", col("gateway_status").cast(StringType())) \
        .withColumn("gateway_annotation", col("gateway_annotation").cast(StringType())) \
        .withColumn("public_key_exponent", col("public_key_exponent").cast(StringType())) \
        .withColumn("public_key_modulus", col("public_key_modulus").cast(StringType()))
    
    # Add derived and metadata columns
    enhanced_df = typed_df \
        .withColumn("is_active", 
                   when(col("gateway_status").isin(["Live", "Online"]), True)
                   .otherwise(False)) \
        .withColumn("extraction_timestamp", current_timestamp())
    
    # Ensure column order matches our schema exactly
    final_df = enhanced_df.select(
        "id", 
        "name", 
        "type", 
        "gateway_status", 
        "gateway_annotation", 
        "public_key_exponent", 
        "public_key_modulus", 
        "is_active", 
        "extraction_timestamp"
    )
    
    # Log DataFrame info for debugging
    if CONFIG['DEBUG_MODE']:
        logger.info(f"Created DataFrame with {final_df.count()} rows")
        logger.info("DataFrame schema:")
        final_df.printSchema()
    
    return final_df
# ==================================


# CELL 9 - Delta Lake Operations Functions
# ==================================
def ensure_delta_table_exists(table_name: str, df_schema):
    """
    Ensure the Delta table exists, creating it if necessary.
    
    This function checks if our target Delta table exists in the Lakehouse.
    If it doesn't exist, it creates a new table with the proper schema.
    
    Args:
        table_name: Name of the Delta table (e.g., "pbi_gateways")
        df_schema: Schema of the DataFrame to match table structure
    """
    try:
        # Check if table exists by trying to describe it
        spark.sql(f"DESCRIBE TABLE {table_name}")
        logger.info(f"Delta table '{table_name}' already exists")
    except Exception:
        # Table doesn't exist, create it
        logger.info(f"Creating new Delta table '{table_name}'")
        
        # Create an empty DataFrame with the required schema
        empty_df = spark.createDataFrame([], df_schema)
        
        # Create the Delta table
        empty_df.write \
            .mode("overwrite") \
            .option("overwriteSchema", "true") \
            .saveAsTable(table_name)
        
        logger.info(f"Delta table '{table_name}' created successfully")


def merge_data_to_delta(source_df, table_name: str):
    """
    Merge new gateway data into the Delta table using MERGE operation.
    
    This function performs an upsert operation:
    - Updates existing records if gateway ID matches (gateway info might change)
    - Inserts new records if gateway ID doesn't exist (new gateways added)
    - Handles schema evolution when column types need to change
    
    Args:
        source_df: DataFrame with new gateway data
        table_name: Name of the target Delta table
    """
    logger.info(f"Starting merge operation for {table_name}")
    
    # Validate the source DataFrame before merge
    logger.info("Validating source DataFrame...")
    source_df.printSchema()
    
    # Show sample data for debugging
    logger.info("Sample source data:")
    source_df.show(3, truncate=False)
    
    # Check existing table schema
    existing_count = spark.table(table_name).count()
    if existing_count == 0:
        logger.info(f"Table {table_name} is empty. Inserting all records.")
        source_df.write.mode("append").saveAsTable(table_name)
        logger.info(f"Inserted {source_df.count()} new gateway records")
        return
    
    # Check for schema compatibility issues
    logger.info("Checking existing table schema...")
    existing_schema = spark.table(table_name).schema
    logger.info("Existing table schema:")
    for field in existing_schema.fields:
        logger.info(f"  {field.name}: {field.dataType}")
    
    # Check if we have NullType columns that need schema evolution
    null_type_columns = []
    for field in existing_schema.fields:
        if str(field.dataType) == "NullType" or "NullType" in str(field.dataType):
            null_type_columns.append(field.name)
    
    # Also check for incompatible types by comparing schemas
    schema_mismatch = False
    source_schema = source_df.schema
    
    for source_field in source_schema.fields:
        existing_field = None
        for field in existing_schema.fields:
            if field.name == source_field.name:
                existing_field = field
                break
        
        if existing_field and str(existing_field.dataType) != str(source_field.dataType):
            logger.warning(f"Schema mismatch for column '{source_field.name}': "
                         f"existing={existing_field.dataType}, source={source_field.dataType}")
            schema_mismatch = True
    
    if null_type_columns or schema_mismatch:
        logger.warning(f"Found incompatible schema that needs evolution.")
        logger.warning(f"NullType columns: {null_type_columns}")
        logger.info("Performing schema evolution by recreating the table...")
        
        # Read and preserve existing data BEFORE dropping the table
        existing_data = spark.table(table_name)
        existing_count = existing_data.count()
        
        # Convert existing data to list BEFORE dropping table
        preserved_data_list = []
        if existing_count > 0:
            logger.info(f"Preserving {existing_count} existing records during schema evolution...")
            
            # Get the source DataFrame column names and types for mapping
            source_columns = {field.name: field.dataType for field in source_df.schema.fields}
            
            # Collect existing data before dropping table
            for row in existing_data.collect():
                preserved_row = {}
                for col_name in source_columns.keys():
                    if col_name in existing_data.columns:
                        # Get the value from existing data
                        value = row[col_name]
                        # Handle null values properly
                        if value is None or (isinstance(value, str) and value.strip() == ""):
                            if str(source_columns[col_name]) == "StringType()":
                                preserved_row[col_name] = ""
                            elif str(source_columns[col_name]) == "BooleanType()":
                                preserved_row[col_name] = False
                            else:
                                preserved_row[col_name] = None
                        else:
                            preserved_row[col_name] = value
                    else:
                        # Column doesn't exist in old data, set default
                        if str(source_columns[col_name]) == "StringType()":
                            preserved_row[col_name] = ""
                        elif str(source_columns[col_name]) == "BooleanType()":
                            preserved_row[col_name] = False
                        else:
                            preserved_row[col_name] = None
                
                preserved_data_list.append(preserved_row)
        
        # Now drop the existing table
        logger.info(f"Dropping existing table {table_name} for schema evolution...")
        spark.sql(f"DROP TABLE IF EXISTS {table_name}")
        
        # Create the combined data
        if preserved_data_list:
            logger.info(f"Creating DataFrame from {len(preserved_data_list)} preserved records...")
            preserved_df = spark.createDataFrame(preserved_data_list, source_df.schema)
            # Union preserved data with new data
            combined_data = preserved_df.union(source_df)
        else:
            logger.info("No existing data to preserve, using only new data...")
            combined_data = source_df
        
        # Create the new table with the correct schema
        logger.info("Creating new table with correct schema...")
        combined_data.write \
            .mode("overwrite") \
            .option("overwriteSchema", "true") \
            .saveAsTable(table_name)
        
        logger.info("Schema evolution completed successfully")
        
    else:
        # Normal merge operation when schemas are compatible
        logger.info("Schemas are compatible. Proceeding with normal merge...")
        
        # Create a temporary view for the merge operation
        source_df.createOrReplaceTempView("gateway_updates")
        
        # Perform the merge operation
        merge_query = f"""
        MERGE INTO {table_name} AS target
        USING gateway_updates AS source
        ON target.id = source.id
        WHEN MATCHED THEN
            UPDATE SET 
                target.name = source.name,
                target.type = source.type,
                target.gateway_status = source.gateway_status,
                target.gateway_annotation = source.gateway_annotation,
                target.public_key_exponent = source.public_key_exponent,
                target.public_key_modulus = source.public_key_modulus,
                target.is_active = source.is_active,
                target.extraction_timestamp = source.extraction_timestamp
        WHEN NOT MATCHED THEN
            INSERT (id, name, type, gateway_status, gateway_annotation, 
                    public_key_exponent, public_key_modulus, is_active, extraction_timestamp)
            VALUES (source.id, source.name, source.type, source.gateway_status, 
                    source.gateway_annotation, source.public_key_exponent, 
                    source.public_key_modulus, source.is_active, source.extraction_timestamp)
        """
        
        # Execute the merge
        spark.sql(merge_query)
    
    # Log merge statistics
    updated_count = spark.table(table_name).count()
    logger.info(f"Merge operation completed successfully")
    logger.info(f"Total records in table after merge: {updated_count}")


def optimize_delta_table(table_name: str):
    """
    Optimize the Delta table for better query performance.
    
    This function performs optimization tasks suitable for Microsoft Fabric:
    - Updates table statistics for better query planning
    - Uses Fabric-compatible optimization methods
    
    Args:
        table_name: Name of the Delta table to optimize
    """
    logger.info(f"Optimizing Delta table '{table_name}'")
    
    try:
        # Update table statistics for the query optimizer
        # This helps Spark make better decisions about query execution plans
        spark.sql(f"ANALYZE TABLE {table_name} COMPUTE STATISTICS")
        logger.info("Table statistics updated successfully")
        
        # In Microsoft Fabric, many Delta optimizations are handled automatically
        # We'll focus on what we can control directly
        
        # Get table details for monitoring
        delta_table = DeltaTable.forName(spark, table_name)
        table_detail = spark.sql(f"DESCRIBE DETAIL {table_name}")
        
        logger.info("Delta table optimization completed")
        logger.info("Note: Microsoft Fabric handles many optimizations automatically")
        
        # Show some optimization info
        if CONFIG['DEBUG_MODE']:
            logger.info("Table details after optimization:")
            table_detail.select("format", "numFiles", "sizeInBytes", "partitionColumns").show(truncate=False)
            
    except Exception as e:
        logger.warning(f"Table optimization encountered an issue: {str(e)}")
        logger.info("Continuing - optimization issues don't affect core functionality")
# ==================================


# CELL 10 - Main Execution Function
# ==================================
def main():
    """
    Main execution function that orchestrates the entire Power BI gateways ETL process.
    
    This function:
    1. Authenticates with Azure AD to get Power BI API access
    2. Retrieves all gateways that the user is an admin for
    3. Creates an enhanced PySpark DataFrame with flattened gateway data
    4. Loads data into a Delta Lake table with upsert logic
    5. Optimizes the table for analytics performance
    6. Provides summary statistics and data quality insights
    """
    try:
        logger.info("Starting Power BI Gateways to Delta Lake ETL process")
        
        # Step 1: Authentication
        logger.info("Step 1: Getting Power BI API access token...")
        access_token = get_access_token()
        logger.info("✓ Successfully obtained access token")
        
        # Step 2: Data Extraction
        logger.info("Step 2: Retrieving gateways from Power BI API...")
        gateways_data = get_gateways(access_token)
        
        # Handle the case where no gateways are found
        if not gateways_data:
            logger.warning("⚠ No gateways found. This might be expected if:")
            logger.warning("  - No gateways are configured in your organization")
            logger.warning("  - You don't have admin permissions on any gateways")
            logger.warning("  - Only VNet gateways exist (not supported by this API)")
            
            # Still create the table structure for consistency
            empty_schema = StructType([
                StructField("id", StringType(), False),
                StructField("name", StringType(), True),
                StructField("type", StringType(), True),
                StructField("gateway_status", StringType(), True),
                StructField("gateway_annotation", StringType(), True),
                StructField("public_key_exponent", StringType(), True),
                StructField("public_key_modulus", StringType(), True),
                StructField("is_active", BooleanType(), False),
                StructField("extraction_timestamp", TimestampType(), False)
            ])
            gateways_df = spark.createDataFrame([], empty_schema)
        else:
            logger.info(f"✓ Successfully retrieved {len(gateways_data)} gateways")
            
            # Step 3: Data Transformation
            logger.info("Step 3: Creating enhanced DataFrame with flattened structure...")
            gateways_df = create_enhanced_gateways_dataframe(gateways_data)
            logger.info("✓ DataFrame created successfully")
        
        # Show sample of the processed data
        logger.info("Sample of processed gateway data:")
        gateways_df.show(5, truncate=False)
        
        # Step 4: Data Loading - Prepare Delta table
        table_name = CONFIG["GATEWAYS_TABLE_NAME"]
        logger.info(f"Step 4: Preparing Delta table '{table_name}'...")
        ensure_delta_table_exists(table_name, gateways_df.schema)
        
        # Step 5: Data Loading - Merge data (if we have data)
        if gateways_data:
            logger.info("Step 5: Merging data into Delta table...")
            merge_data_to_delta(gateways_df, table_name)
            logger.info("✓ Data merge completed successfully")
            
            # Step 6: Optimization
            logger.info("Step 6: Optimizing Delta table...")
            optimize_delta_table(table_name)
            logger.info("✓ Table optimization completed")
        else:
            logger.info("Step 5-6: Skipping merge and optimization (no data to process)")
        
        # Step 7: Results and Analytics
        logger.info("Step 7: Generating summary report...")
        
        # Show table information
        logger.info("=== TABLE INFORMATION ===")
        table_details = spark.sql(f"DESCRIBE DETAIL {table_name}")
        table_details.select("format", "numFiles", "sizeInBytes").show(truncate=False)
        
        # Show row count
        row_count = spark.table(table_name).count()
        logger.info(f"📊 Total gateways in table: {row_count}")
        
        # Show summary statistics (if we have data)
        if row_count > 0:
            logger.info("=== GATEWAY ANALYTICS ===")
            
            summary_stats = spark.sql(f"""
                SELECT 
                    COUNT(*) as total_gateways,
                    COUNT(DISTINCT type) as gateway_types,
                    COUNT(DISTINCT gateway_status) as status_types,
                    SUM(CASE WHEN is_active = true THEN 1 ELSE 0 END) as active_gateways,
                    MAX(extraction_timestamp) as last_updated
                FROM {table_name}
            """)
            
            logger.info("📈 Summary Statistics:")
            summary_stats.show(truncate=False)
            
            # Gateway distribution by type
            type_distribution = spark.sql(f"""
                SELECT 
                    COALESCE(type, 'Unknown') as gateway_type,
                    COUNT(*) as count,
                    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
                FROM {table_name}
                GROUP BY type
                ORDER BY count DESC
            """)
            
            logger.info("🔧 Gateway Distribution by Type:")
            type_distribution.show(truncate=False)
            
            # Gateway status distribution
            status_distribution = spark.sql(f"""
                SELECT 
                    COALESCE(gateway_status, 'Unknown') as status,
                    COUNT(*) as count,
                    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
                FROM {table_name}
                GROUP BY gateway_status
                ORDER BY count DESC
            """)
            
            logger.info("📡 Gateway Status Distribution:")
            status_distribution.show(truncate=False)
            
            # Activity summary with better labeling
            activity_summary = spark.sql(f"""
                SELECT 
                    CASE 
                        WHEN is_active = true THEN 'Active'
                        ELSE 'Inactive'
                    END as status,
                    COUNT(*) as count
                FROM {table_name}
                GROUP BY is_active
                ORDER BY is_active DESC
            """)
            
            logger.info("⚡ Gateway Activity Summary:")
            activity_summary.show(truncate=False)
        
        logger.info("🎉 ETL process completed successfully!")
        logger.info(f"✅ Gateway data is now available in the '{table_name}' table")
        logger.info("💡 You can now use this data for Power BI reports and analytics")
        
        return gateways_df
        
    except Exception as e:
        logger.error(f"❌ Error in main execution: {str(e)}")
        logger.error("🔍 Check the logs above for detailed error information")
        raise
# ==================================


# CELL 11 - Execute Main Function
# ==================================
# Execute the main function
# This is the entry point that starts the entire ETL process
if __name__ == "__main__":
    try:
        logger.info("🚀 Starting Power BI Gateways ETL Pipeline")
        gateways_df = main()
        logger.info("✨ Pipeline completed successfully!")
    except Exception as e:
        logger.error(f"💥 Pipeline failed: {str(e)}")
        raise
# ==================================

StatementMeta(, afe84ec1-147d-4374-909d-9e60157c61a7, 4, Finished, Available, Finished)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- type: string (nullable = true)
 |-- gateway_status: string (nullable = true)
 |-- gateway_annotation: string (nullable = true)
 |-- public_key_exponent: string (nullable = true)
 |-- public_key_modulus: string (nullable = true)
 |-- is_active: boolean (nullable = false)
 |-- extraction_timestamp: timestamp (nullable = false)

+------------------------------------+-------------------------------+--------+--------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

2025-07-16 19:02:21,590 - INFO - Checking existing table schema...
2025-07-16 19:02:21,783 - INFO - Existing table schema:
2025-07-16 19:02:21,784 - INFO -   id: StringType()
2025-07-16 19:02:21,784 - INFO -   name: StringType()
2025-07-16 19:02:21,785 - INFO -   type: StringType()
2025-07-16 19:02:21,785 - INFO -   gateway_status: StringType()
2025-07-16 19:02:21,786 - INFO -   gateway_annotation: StringType()
2025-07-16 19:02:21,786 - INFO -   public_key_exponent: StringType()
2025-07-16 19:02:21,787 - INFO -   public_key_modulus: StringType()
2025-07-16 19:02:21,787 - INFO -   is_active: BooleanType()
2025-07-16 19:02:21,788 - INFO -   extraction_timestamp: TimestampType()
2025-07-16 19:02:21,789 - INFO - Schemas are compatible. Proceeding with normal merge...
2025-07-16 19:02:35,053 - INFO - Merge operation completed successfully
2025-07-16 19:02:35,055 - INFO - Total records in table after merge: 14
2025-07-16 19:02:35,056 - INFO - ✓ Data merge completed successfully
2025-07-16 19

In [3]:
from pyspark.sql import SparkSession

# create Spark session
spark = SparkSession.builder.appName("Refresh SQL Endpoint Metadata").getOrCreate()

# refresh the specific table
spark.sql("REFRESH TABLE pbi_gateways")
print("Metadata refresh triggered successfully.")


StatementMeta(, afe84ec1-147d-4374-909d-9e60157c61a7, 5, Finished, Available, Finished)

Metadata refresh triggered successfully.
