In [1]:
# DO NOT DELETE THIS CELL

# API Name: Gateways - Get Datasources
# Command:  GET https://api.powerbi.com/v1.0/myorg/gateways/{gatewayId}/datasources
# Doc:      https://learn.microsoft.com/en-us/rest/api/power-bi/gateways/get-datasources

# Loads table: pbi_gateways_datasources

# Note: this queries the pbi_gateways table to get a list of gatewayId values for the API calls.

StatementMeta(, e69eb60a-ecf1-4ce9-8d24-d091e8c1cbc6, 3, Finished, Available, Finished)

In [2]:
# CELL 1 - Title and Introduction
# ==================================
# Power BI Gateway Datasources to Delta Lake - PySpark Notebook
# This notebook retrieves Power BI gateway datasources for all gateways and loads them 
# into a Delta Lake table with optimization for analytics workloads
# ==================================


# CELL 2 - Import Libraries
# ==================================
# Import required libraries
import requests
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit, from_json
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import logging
from typing import Dict, List, Optional
from delta.tables import DeltaTable
import random
import time
# ==================================


# CELL 3 - Configure Logging and Initialize Spark
# ==================================
# Logging setup: timestamped, INFO-level messages so every step of the load
# can be followed in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Spark session with the Delta Lake extension and catalog configured.
# getOrCreate() hands back the running session when the notebook already has one.
spark = (
    SparkSession.builder
    .appName("PowerBIGatewayDatasourcesToDelta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Delta write tuning: optimized writes and automatic compaction.
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")
# ==================================


# CELL 4 - Configuration Parameters
# ==================================
# Configuration Parameters
# These are the settings we'll use throughout the notebook
CONFIG = {
    # --- Power BI REST API ---
    "API_BASE_URL": "https://api.powerbi.com/v1.0/myorg",
    # Per-gateway datasources endpoint; {gatewayId} is substituted for each call
    "DATASOURCES_ENDPOINT": "/gateways/{gatewayId}/datasources",

    # --- Retry and backoff behaviour ---
    # Number of attempts made for a single API call
    "MAX_RETRIES": 5,
    # First backoff interval, in seconds
    "INITIAL_BACKOFF_SEC": 1,
    # Upper bound for the backoff interval, in seconds
    "MAX_BACKOFF_SEC": 60,
    # Multiplier applied to the backoff interval after each retry
    "BACKOFF_FACTOR": 2,
    # Random jitter added to each wait, as a fraction of the backoff interval
    "JITTER_FACTOR": 0.1,
    # Per-request timeout, in seconds
    "TIMEOUT": 30,

    # --- Tables ---
    # Target Delta table for the datasources
    "DATASOURCES_TABLE_NAME": "pbi_gateways_datasources",
    # Source table that supplies the gateway IDs
    "GATEWAYS_TABLE_NAME": "pbi_gateways",
    # Default Tables folder in a Fabric Lakehouse
    "LAKEHOUSE_PATH": "Tables",

    # --- Diagnostics ---
    # When True, extra debugging output (sample IDs / sample payloads) is logged
    "DEBUG_MODE": True,
}
# ==================================


# CELL 5 - Authentication Function
# ==================================
def get_access_token():
    """
    Obtain an access token for the Power BI API via mssparkutils.

    Token audiences are tried one after another, in this order:
    1. https://api.powerbi.com
    2. https://analysis.windows.net/powerbi/api
    3. https://api.powerplatform.com
    4. https://graph.microsoft.com

    The first audience for which mssparkutils.credentials.getToken() succeeds
    wins and its token is returned. Each failure is logged as a warning and the
    next audience is tried.

    Returns:
        The value returned by mssparkutils.credentials.getToken() for the first
        audience that succeeds.

    Raises:
        Exception: If no audience yields a token. Troubleshooting guidance is
            logged at ERROR level before raising.

    Note:
        A manual-token fallback is present only as commented-out code near the
        end of the function; it has to be enabled by hand.
    """
    # One entry per attempt:
    # (audience, "attempting" message, "success" message, failure prefix, follow-up info or None)
    token_attempts = (
        (
            "https://api.powerbi.com",
            "Attempting to get token using mssparkutils.credentials.getToken()...",
            "Successfully obtained token using mssparkutils",
            "Primary token method failed",
            "Trying alternative token methods...",
        ),
        (
            "https://analysis.windows.net/powerbi/api",
            "Attempting to get token using alternative endpoint...",
            "Successfully obtained token using alternative endpoint",
            "Alternative endpoint failed",
            None,
        ),
        (
            "https://api.powerplatform.com",
            "Attempting to get token for Power Platform...",
            "Successfully obtained token for Power Platform",
            "Power Platform token failed",
            None,
        ),
        (
            "https://graph.microsoft.com",
            "Attempting to get token for Microsoft Graph...",
            "Successfully obtained token for Microsoft Graph",
            "Microsoft Graph token failed",
            None,
        ),
    )

    for audience, attempt_msg, success_msg, failure_prefix, follow_up in token_attempts:
        try:
            logger.info(attempt_msg)
            # Imported inside the try so a missing notebookutils module is
            # handled like any other failed attempt.
            from notebookutils import mssparkutils
            token = mssparkutils.credentials.getToken(audience)
            logger.info(success_msg)
            return token
        except Exception as err:
            logger.warning(f"{failure_prefix}: {str(err)}")
            if follow_up is not None:
                logger.info(follow_up)

    # Every automatic attempt failed: log the troubleshooting guide.
    guidance_lines = (
        "All automatic token methods failed.",
        "Please try one of the following solutions:",
        "1. Wait a few minutes and retry - this might be a temporary Fabric service issue",
        "2. Restart your Fabric notebook kernel",
        "3. Check your workspace permissions and access to Power BI APIs",
        "4. Contact your Fabric administrator",
        "",
        "As a temporary workaround, you can manually provide a token:",
        "- Go to https://learn.microsoft.com/en-us/rest/api/power-bi/",
        "- Click 'Try It' on any API",
        "- Copy the Bearer token from the Authorization header",
        "- Uncomment and modify the line below with your token:",
        "",
        "# MANUAL_TOKEN = 'your_bearer_token_here'",
        "# return MANUAL_TOKEN",
    )
    for guidance_line in guidance_lines:
        logger.error(guidance_line)

    # Uncomment the lines below and add your manual token if needed
    # MANUAL_TOKEN = "your_bearer_token_here"
    # if MANUAL_TOKEN and MANUAL_TOKEN != "your_bearer_token_here":
    #     logger.info("Using manually provided token")
    #     return MANUAL_TOKEN

    raise Exception("Unable to obtain access token using any available method. Please check the error messages above for solutions.")
# ==================================


# CELL 6/7 - Token Validation and API Call Functions
# ==================================
def validate_access_token(access_token: str) -> bool:
    """
    Check whether the access token is accepted by the Power BI API.

    Sends a single GET request to the /groups endpoint and interprets the
    HTTP status code of the reply.

    Args:
        access_token: The access token to validate

    Returns:
        bool: True only when the probe returns HTTP 200. Any other status, or
        any exception raised while probing, yields False (the reason is logged).
    """
    try:
        logger.info("Validating access token...")

        # Lightweight probe request used purely to exercise authentication
        probe = requests.get(
            f"{CONFIG['API_BASE_URL']}/groups",
            headers={
                "Authorization": f"Bearer {access_token}",
                "Content-Type": "application/json"
            },
            timeout=10,
        )
        status = probe.status_code

        if status == 200:
            logger.info("Access token validation successful")
            return True

        # Everything below is a failed validation; only the log message differs.
        if status == 401:
            logger.error("Access token is invalid or expired")
        elif status == 403:
            logger.error("Access token is valid but lacks required permissions")
            logger.error("Required scopes: Dataset.ReadWrite.All or Dataset.Read.All")
        else:
            logger.warning(f"Token validation returned status {status}: {probe.text}")
        return False

    except Exception as err:
        logger.error(f"Token validation failed: {str(err)}")
        return False
# ==================================
# ==================================
def call_powerbi_api(endpoint: str, access_token: str, params: Optional[Dict] = None) -> Dict:
    """
    Make a GET request to the Power BI REST API with retry and rate-limit handling.

    Retry policy (at most CONFIG['MAX_RETRIES'] attempts in total):
    - Transport failures (timeouts, connection errors) and 5xx responses are
      retried with exponential backoff plus random jitter.
    - 429 responses wait for the server's Retry-After value when it is a whole
      number of seconds, otherwise for the current backoff interval.
    - Any other 4xx response is raised immediately: repeating the same request
      cannot turn a 401/403/404 into a success.
    - If the last attempt still fails, the failure is raised. The function never
      falls through and returns None.

    The Authorization header is redacted before request headers are logged so
    the bearer token does not end up in notebook output.

    Args:
        endpoint: The API endpoint path (e.g., "/gateways/123/datasources")
        access_token: The Azure AD access token
        params: Optional query parameters for the API call

    Returns:
        dict: The JSON response from the API

    Raises:
        requests.exceptions.RequestException: If the request fails with a
            non-retryable status, or is still failing after the last attempt
            (HTTP errors surface as requests.exceptions.HTTPError).
        ValueError: If a successful response body cannot be parsed as JSON.
    """
    url = f"{CONFIG['API_BASE_URL']}{endpoint}"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }

    max_retries = CONFIG['MAX_RETRIES']
    backoff_time = CONFIG['INITIAL_BACKOFF_SEC']

    def next_backoff_wait() -> float:
        """Return the wait for this retry and grow the backoff for the next one."""
        nonlocal backoff_time
        wait = backoff_time + random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
        backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])
        return wait

    for attempt in range(max_retries):
        last_attempt = attempt == max_retries - 1

        # --- Send the request; only transport-level failures are handled here ---
        try:
            logger.info(f"Making API call to: {url} with params: {params} (Attempt {attempt + 1})")
            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=CONFIG['TIMEOUT']
            )
        except requests.exceptions.RequestException as e:
            logger.warning(f"API call failed (Attempt {attempt + 1}): {str(e)}")
            if last_attempt:
                logger.error(f"All retry attempts failed for endpoint: {endpoint}")
                logger.error(f"Final error: {str(e)}")
                raise
            wait_time = next_backoff_wait()
            logger.info(f"Waiting {wait_time:.2f} seconds before retry.")
            time.sleep(wait_time)
            continue

        status = response.status_code
        logger.info(f"Response status: {status}")

        # --- Success: decode and return the payload ---
        if status < 400:
            try:
                response_json = response.json()
            except ValueError as e:
                # Every JSON decode error raised by requests/json/simplejson is a ValueError
                logger.error(f"Failed to parse response as JSON: {str(e)}")
                logger.error(f"Response content: {response.text[:1000]}")  # Log first 1000 chars of response
                raise
            if isinstance(response_json, dict) and isinstance(response_json.get("value"), list):
                logger.info(f"Response contains {len(response_json['value'])} items in 'value' array")
            return response_json

        # --- Rate limited: wait as instructed, unless no attempts remain ---
        if status == 429:
            if last_attempt:
                logger.error(f"All retry attempts failed for endpoint: {endpoint}")
                logger.error("Final error: rate limit (429) still in effect on the last attempt")
                response.raise_for_status()
            retry_after = response.headers.get('Retry-After')
            if retry_after and retry_after.isdigit():
                # The server told us how long to wait
                wait_time = int(retry_after)
            else:
                wait_time = next_backoff_wait()
            logger.warning(f"Rate limit exceeded (429). Waiting {wait_time:.2f} seconds before retry.")
            time.sleep(wait_time)
            continue

        # --- Any other HTTP error ---
        logger.error(f"API error: Status {status}, Response: {response.text}")
        logger.error(f"Request URL: {response.request.url}")
        # Never log the bearer token: redact the Authorization header
        safe_headers = {
            name: ("<redacted>" if name.lower() == "authorization" else value)
            for name, value in response.request.headers.items()
        }
        logger.error(f"Request headers: {safe_headers}")

        retryable = status >= 500  # server-side errors may be transient
        if last_attempt or not retryable:
            if retryable:
                logger.error(f"All retry attempts failed for endpoint: {endpoint}")
            response.raise_for_status()

        wait_time = next_backoff_wait()
        logger.info(f"Waiting {wait_time:.2f} seconds before retry.")
        time.sleep(wait_time)

    # Reached only when no attempt produced a result or an exception
    # (e.g. MAX_RETRIES <= 0). Fail loudly rather than return None.
    raise requests.exceptions.RetryError(
        f"No successful response for endpoint {endpoint} after {max_retries} attempt(s)"
    )
# ==================================


# CELL 8 - Get Gateway IDs Function
# ==================================
def get_gateway_ids() -> List[str]:
    """
    Read the gateway IDs from the table named by CONFIG['GATEWAYS_TABLE_NAME'].

    The `id` column of every row is collected to the driver; these are the IDs
    that are subsequently used to request datasources per gateway.

    Returns:
        list: The `id` value of each row in the gateways table

    Raises:
        Exception: Any error raised while querying the table is logged and
            re-raised unchanged.
    """
    try:
        logger.info(f"Querying {CONFIG['GATEWAYS_TABLE_NAME']} table for gateway IDs...")

        # Pull the id column to the driver and unwrap each Row
        id_rows = spark.sql(f"SELECT id FROM {CONFIG['GATEWAYS_TABLE_NAME']}").collect()
        gateway_ids = [id_row["id"] for id_row in id_rows]

        logger.info(f"Found {len(gateway_ids)} gateways to process")

        # In debug mode, log the first five IDs as a sanity check
        if CONFIG['DEBUG_MODE'] and gateway_ids:
            logger.info(f"Sample gateway IDs: {gateway_ids[:5]}")  # Show first 5

        return gateway_ids

    except Exception as err:
        logger.error(f"Failed to retrieve gateway IDs from {CONFIG['GATEWAYS_TABLE_NAME']}: {str(err)}")
        logger.error("Make sure the pbi_gateways table exists and contains gateway data")
        raise
# ==================================


# CELL 9 - Get Datasources for Gateway Function
# ==================================
def get_datasources_for_gateway(gateway_id: str, access_token: str) -> List[Dict]:
    """
    Fetch the datasources configured on a single gateway.

    Calls the per-gateway datasources endpoint through call_powerbi_api() and
    returns the contents of the response's "value" entry. This is best-effort:
    a failure for one gateway is logged and reported as an empty list so the
    caller can carry on with the remaining gateways.

    Args:
        gateway_id: The unique identifier of the gateway
        access_token: The Azure AD access token

    Returns:
        list: The datasource objects for the gateway; an empty list when the
        response has no "value" entry or when any error occurs.
    """
    try:
        # Fill the gateway ID into the endpoint template
        gateway_endpoint = CONFIG['DATASOURCES_ENDPOINT'].format(gatewayId=gateway_id)

        logger.info(f"Retrieving datasources for gateway: {gateway_id}")

        payload = call_powerbi_api(gateway_endpoint, access_token)
        found = payload.get("value", [])

        logger.info(f"Retrieved {len(found)} datasources for gateway {gateway_id}")

        # In debug mode, log the first datasource as a sample
        if CONFIG['DEBUG_MODE'] and found:
            logger.info(f"Sample datasource: {json.dumps(found[0], indent=2)}")

        return found

    except requests.exceptions.RequestException as err:
        # API/transport failure: warn, but keep the overall run alive
        logger.warning(f"Failed to retrieve datasources for gateway {gateway_id}: {str(err)}")
        return []
    except Exception as err:
        # Anything else is unexpected, but is still not fatal for the run
        logger.error(f"Unexpected error retrieving datasources for gateway {gateway_id}: {str(err)}")
        return []
# ==================================


# CELL 10 - Get All Datasources Function
# ==================================
def get_all_datasources(access_token: str) -> List[Dict]:
    """
    Collect the datasources of every known gateway into one list.

    Steps:
    1. Read the gateway IDs via get_gateway_ids()
    2. Request the datasources of each gateway in turn
    3. Concatenate the per-gateway results

    A 0.5 second pause is inserted between consecutive gateway requests.

    Args:
        access_token: The Azure AD access token

    Returns:
        list: All datasource objects from all gateways (empty when there are
        no gateway IDs)
    """
    gateway_ids = get_gateway_ids()
    if not gateway_ids:
        logger.warning("No gateway IDs found. Cannot retrieve datasources.")
        return []

    collected = []
    gateway_count = len(gateway_ids)

    for position, gateway_id in enumerate(gateway_ids, start=1):
        logger.info(f"Processing gateway {position}/{gateway_count}: {gateway_id}")

        batch = get_datasources_for_gateway(gateway_id, access_token)

        if not batch:
            logger.info(f"No datasources found for gateway {gateway_id}")
        else:
            collected.extend(batch)
            logger.info(f"Added {len(batch)} datasources. Running total: {len(collected)}")

        # Short pause between calls to go easy on the API; skipped after the last gateway
        is_last_gateway = position >= gateway_count
        if not is_last_gateway:
            time.sleep(0.5)

    logger.info(f"Finished retrieving datasources from all gateways. Total count: {len(collected)}")
    return collected
# ==================================


# CELL 11 - Create Enhanced DataFrame Function
# ==================================
def _to_optional_text(value):
    """Return None and strings unchanged; render dicts/lists as JSON and anything else via str()."""
    if value is None or isinstance(value, str):
        return value
    if isinstance(value, (dict, list)):
        return json.dumps(value)
    return str(value)


def create_enhanced_datasources_dataframe(datasources_data: List[Dict]) -> "DataFrame":
    """
    Convert the datasources data into a PySpark DataFrame for Delta Lake.

    The DataFrame is built directly from the extracted rows with an explicit
    schema, so column types never depend on type inference. A column whose
    values are all None is still typed as a string, and the result has the same
    schema whether or not any rows are present.

    Columns: datasource_id, gateway_id, datasource_name, datasource_type,
    credential_type, connection_details, extraction_timestamp.

    Records that lack "id" or "gatewayId" are skipped with a warning. Both
    columns are declared non-nullable, and a row without a datasource ID could
    never be matched by the downstream MERGE key.

    Args:
        datasources_data: List of datasource dictionaries from the API

    Returns:
        DataFrame: A PySpark DataFrame ready for Delta Lake (empty, but with the
        full schema, when there is nothing to load)
    """
    # Fields taken from the API payload
    api_fields = [
        StructField("datasource_id", StringType(), False),    # False = not nullable (PRIMARY KEY)
        StructField("gateway_id", StringType(), False),       # False = not nullable (FOREIGN KEY)
        StructField("datasource_name", StringType(), True),   # True = nullable
        StructField("datasource_type", StringType(), True),   # True = nullable
        StructField("credential_type", StringType(), True),   # True = nullable
        StructField("connection_details", StringType(), True), # True = nullable
    ]
    # Full output schema = API fields + extraction metadata
    full_schema = StructType(
        api_fields + [StructField("extraction_timestamp", TimestampType(), False)]
    )

    rows = []
    skipped_count = 0
    for datasource in datasources_data or []:
        datasource_id = _to_optional_text(datasource.get("id"))
        gateway_id = _to_optional_text(datasource.get("gatewayId"))
        if datasource_id is None or gateway_id is None:
            skipped_count += 1
            continue
        rows.append((
            datasource_id,
            gateway_id,
            _to_optional_text(datasource.get("datasourceName")),
            _to_optional_text(datasource.get("datasourceType")),
            _to_optional_text(datasource.get("credentialType")),
            # Stored as text; a structured value is serialised to a JSON string
            _to_optional_text(datasource.get("connectionDetails")),
        ))

    if skipped_count:
        logger.warning(f"Skipped {skipped_count} datasource record(s) missing 'id' or 'gatewayId'")

    if not rows:
        logger.warning("No datasources found. Creating empty DataFrame.")
        return spark.createDataFrame([], full_schema)

    spark_df = spark.createDataFrame(rows, StructType(api_fields))

    # Add metadata column for tracking when this data was extracted
    return spark_df.withColumn("extraction_timestamp", current_timestamp())
# ==================================


# CELL 12 - Delta Lake Operations Functions
# ==================================
def ensure_delta_table_exists(table_name: str, df_schema):
    """
    Ensure the Delta table exists, creating it (empty) if necessary.

    Existence is checked through the catalog rather than by treating every
    error from a probe query as "table missing". The table is created with
    save mode "ignore", so this function can never overwrite data in a table
    that already exists, even if the existence check is wrong or raced.

    Args:
        table_name: Name of the Delta table
        df_schema: Schema of the DataFrame (used for the newly created table)
    """
    if spark.catalog.tableExists(table_name):
        logger.info(f"Delta table '{table_name}' already exists")
        return

    logger.info(f"Creating Delta table '{table_name}'")

    # Create an empty DataFrame with the schema
    empty_df = spark.createDataFrame([], df_schema)

    # Create the Delta table (without partitioning); "ignore" = no-op if it already exists
    empty_df.write \
        .format("delta") \
        .mode("ignore") \
        .saveAsTable(table_name)

    logger.info(f"Delta table '{table_name}' created successfully")


def merge_data_to_delta(source_df, table_name: str):
    """
    Merge new datasource data into the Delta table using MERGE operation.

    This function performs an upsert operation:
    - Updates existing records if datasource ID matches
    - Inserts new records if datasource ID doesn't exist

    The source is de-duplicated on datasource_id first. A MERGE fails when
    several source rows match the same target row, and the append path would
    otherwise store duplicates. The temporary view used by the MERGE statement
    is dropped again afterwards.

    Args:
        source_df: DataFrame with new data
        table_name: Name of the target Delta table
    """
    logger.info(f"Starting merge operation for {table_name}")

    # One row per key; which duplicate survives is arbitrary
    deduped_df = source_df.dropDuplicates(["datasource_id"])

    # If the table is empty, just insert all records.
    # head(1) fetches at most one row instead of counting the whole table.
    if len(spark.table(table_name).head(1)) == 0:
        logger.info(f"Table {table_name} is empty. Inserting all records.")
        deduped_df.write.mode("append").saveAsTable(table_name)
        return

    # Create a temporary view for the merge operation
    view_name = "datasource_updates"
    deduped_df.createOrReplaceTempView(view_name)

    merge_query = f"""
    MERGE INTO {table_name} AS target
    USING {view_name} AS source
    ON target.datasource_id = source.datasource_id
    WHEN MATCHED THEN
        UPDATE SET 
            target.gateway_id = source.gateway_id,
            target.datasource_name = source.datasource_name,
            target.datasource_type = source.datasource_type,
            target.credential_type = source.credential_type,
            target.connection_details = source.connection_details,
            target.extraction_timestamp = source.extraction_timestamp
    WHEN NOT MATCHED THEN
        INSERT *
    """

    try:
        spark.sql(merge_query)
    finally:
        # Do not leave the view behind in the session, whether or not the merge worked
        spark.catalog.dropTempView(view_name)

    logger.info("Merge operation completed successfully")


def optimize_delta_table(table_name: str):
    """
    Refresh the table's statistics to help query planning.

    Only `ANALYZE TABLE ... COMPUTE STATISTICS` is executed; no OPTIMIZE or
    ZORDER command is issued. This step is best-effort: any failure is logged
    as a warning and does not interrupt the load.

    Args:
        table_name: Name of the Delta table to optimize
    """
    logger.info(f"Optimizing Delta table '{table_name}'")

    try:
        # Update table statistics for better query planning
        spark.sql(f"ANALYZE TABLE {table_name} COMPUTE STATISTICS")
        logger.info("Table statistics updated successfully")

        logger.info("Delta table optimization completed via statistics computation")
        logger.info("Note: Microsoft Fabric may automatically optimize Delta tables")
    except Exception as e:
        logger.warning(f"Table optimization step encountered an issue: {str(e)}")
        logger.info("Continuing with process - optimization is not critical for functionality")
# ==================================


# CELL 13 - Main Execution Function
# ==================================
def main():
    """
    Main execution function that orchestrates the entire process.

    This function:
    1. Gets and validates the authentication token
    2. Retrieves all datasources from all gateways via the API
    3. Creates a PySpark DataFrame with the datasource data (an empty one with
       the same schema when nothing was retrieved)
    4. Ensures the target Delta table exists and merges the data into it
    5. Refreshes table statistics
    6. Prints table details and summary statistics

    Returns:
        DataFrame: The DataFrame built from the retrieved datasources

    Raises:
        RuntimeError: If the access token fails validation.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        logger.info("Starting Power BI Gateway Datasources to Delta Lake process")

        # Step 1: Get authentication token
        logger.info("Getting access token...")
        access_token = get_access_token()
        logger.info("Successfully obtained access token")

        # Step 1.5: Validate the token
        if not validate_access_token(access_token):
            raise RuntimeError("Access token validation failed. Please check your permissions and try again.")

        # Step 2: Retrieve all datasources from all gateways
        logger.info("Retrieving datasources from Power BI API...")
        datasources_data = get_all_datasources(access_token)

        # Step 3: Build the DataFrame. The builder returns an empty DataFrame with
        # the full schema for empty input, so the schema is defined in one place only.
        if datasources_data:
            logger.info(f"Creating DataFrame for {len(datasources_data)} datasources...")
        else:
            logger.warning("No datasources found. Please check your permissions and gateway access.")
        datasources_df = create_enhanced_datasources_dataframe(datasources_data)

        # Show sample data
        logger.info("Sample of enhanced datasources data:")
        datasources_df.show(5, truncate=False)

        # Step 4: Prepare Delta table
        table_name = CONFIG["DATASOURCES_TABLE_NAME"]
        ensure_delta_table_exists(table_name, datasources_df.schema)

        # Step 5: Merge data into Delta table (if we have data)
        if datasources_data:
            merge_data_to_delta(datasources_df, table_name)

            # Step 6: Optimize the Delta table
            optimize_delta_table(table_name)

        # Step 7: Display final statistics
        logger.info("Loading completed successfully!")

        # Show table information
        spark.sql(f"DESCRIBE DETAIL {table_name}").show(truncate=False)

        # Show row count
        row_count = spark.table(table_name).count()
        logger.info(f"Total rows in {table_name}: {row_count}")

        # Show summary statistics
        summary_stats = spark.sql(f"""
            SELECT 
                COUNT(*) as total_datasources,
                COUNT(DISTINCT gateway_id) as unique_gateways,
                COUNT(DISTINCT datasource_type) as datasource_types,
                COUNT(DISTINCT credential_type) as credential_types,
                MAX(extraction_timestamp) as last_updated
            FROM {table_name}
        """)

        logger.info("Summary statistics:")
        summary_stats.show(truncate=False)

        # Optional: per-column distributions (same query shape for each column)
        distributions = (
            ("datasource_type", "Datasource distribution by type:"),
            ("credential_type", "Datasource distribution by credential type:"),
        )
        for column_name, heading in distributions:
            distribution_df = spark.sql(f"""
                SELECT 
                    {column_name},
                    COUNT(*) as count
                FROM {table_name}
                GROUP BY {column_name}
                ORDER BY count DESC
            """)
            logger.info(heading)
            distribution_df.show(truncate=False)

        return datasources_df

    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")
        raise
# ==================================


# CELL 14 - Execute Main Function
# ==================================
# Execute the main function.
# The guard runs the pipeline only when this code executes as the `__main__`
# module. The DataFrame returned by main() is bound to the module-level name
# `datasources_df`.
if __name__ == "__main__":
    datasources_df = main()
# ==================================

StatementMeta(, e69eb60a-ecf1-4ce9-8d24-d091e8c1cbc6, 4, Finished, Available, Finished)

2025-07-16 19:08:30,066 - INFO - Starting Power BI Gateway Datasources to Delta Lake process
2025-07-16 19:08:30,068 - INFO - Getting access token...
2025-07-16 19:08:30,069 - INFO - Attempting to get token using mssparkutils.credentials.getToken()...
: java.io.IOException: 500 {"code":"INTERNAL_ERROR","subCode":0,"message":"An internal error occurred.","timeStamp":"2025-07-16T19:09:28.8583809Z","httpStatusCode":500,"hresult":-2147467259,"details":[{"code":"RootActivityId","message":"95c3c4e8-a4c0-4229-967c-7c8fb5c7872f"},{"code":"Category","message":"System"},{"code":"Source","message":"TM"}]}
	at com.microsoft.azure.trident.tokenlibrary.TokenLibrary.getAccessToken(TokenLibrary.scala:532)
	at com.microsoft.azure.trident.tokenlibrary.TokenLibrary.getAccessToken(TokenLibrary.scala:448)
	at com.microsoft.azure.trident.tokenlibrary.TokenLibrary$.getAccessToken(TokenLibrary.scala:1324)
	at mssparkutils.credentials$.getToken(credentials.scala:41)
	at mssparkutils.credentials.getToken(creden

+------------------------------------+------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
|datasource_id                       |gateway_id                          |datasource_name                                                                                                                                     |datasource_type|credential_type|connection_details                                                                                                                                            |extraction_timestamp      |
+------------------------------------+------------------------------------+---------------------------------------

In [3]:
from pyspark.sql import SparkSession

# Get a Spark session via getOrCreate() and rebind the notebook-level `spark`
# name to it.
spark = SparkSession.builder.appName("Refresh SQL Endpoint Metadata").getOrCreate()

# Issue REFRESH TABLE for the table loaded above (the name is hard-coded here).
# NOTE(review): the app name suggests the goal is to refresh the SQL endpoint's
# metadata; REFRESH TABLE is a Spark SQL statement acting on Spark's own cached
# entries for the table - confirm it achieves the intended effect.
spark.sql("REFRESH TABLE pbi_gateways_datasources")
print("Metadata refresh triggered successfully.")


StatementMeta(, e69eb60a-ecf1-4ce9-8d24-d091e8c1cbc6, 5, Finished, Available, Finished)

Metadata refresh triggered successfully.
