In [1]:
# DO NOT DELETE THIS CELL

# IMPORTANT!!! This notebook only pulls data for objects on the 'MDA Institutional Capacity - PROD' capacity! See the WHERE clause of the SQL query in get_dataset_ids() (CELL 7) for details.

# API Name: Admin - Datasets GetDatasourcesAsAdmin
# Command:  GET https://api.powerbi.com/v1.0/myorg/admin/datasets/{datasetId}/datasources
# Doc:      https://learn.microsoft.com/en-us/rest/api/power-bi/admin/datasets-get-datasources-as-admin

# Loads table: pbi_datasets_datasources
# Loads table: pbi_datasets_datasources_connection_details

# IMPORTANT!!! This notebook only pulls data for objects on the 'MDA Institutional Capacity - PROD' capacity! See the WHERE clause of the SQL query in get_dataset_ids() (CELL 7) for details.
# Note: this queries the pbi_datasets table to get a list of dataset_Id values for the API calls.

StatementMeta(, bc48bd6a-b0ef-452f-a9f2-189999c838f6, 3, Finished, Available, Finished)

In [2]:
# CELL 1 - Title and Introduction
# ==================================
# Microsoft Fabric Dataset Datasources to Delta Lake - PySpark Notebook
# This notebook retrieves dataset datasources from Power BI Admin API and loads them into Delta Lake tables
# with optimization for analytics workloads
# ==================================



# CELL 2 - Import Libraries
# ==================================
# Import required libraries
import requests
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit, explode, when, isnan, isnull
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import logging
from typing import Dict, List, Optional, Tuple
from delta.tables import DeltaTable
import random
import time
import uuid
from datetime import datetime
# ==================================


# CELL 3 - Configure Logging and Initialize Spark
# ==================================
# Logging setup
# Timestamped INFO-level messages let us follow the run and debug issues
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Spark session with the Delta Lake SQL extension and catalog registered
# In Fabric notebooks, Spark is pre-configured with Delta support
spark = (
    SparkSession.builder
    .appName("FabricDatasetDatasourcesToDelta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Session-level Delta write settings: optimized writes and auto-compaction
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")
# ==================================


# CELL 4 - Configuration Parameters
# ==================================
# Notebook-wide settings
# Every tunable value used by the functions below lives in this one dictionary
CONFIG = {
    # --- Power BI REST API ---
    "API_BASE_URL": "https://api.powerbi.com/v1.0/myorg",
    # Template for datasources endpoint; {dataset_id} is filled in per request
    "DATASOURCES_ENDPOINT_TEMPLATE": "/admin/datasets/{dataset_id}/datasources",

    # --- Retry / backoff behaviour ---
    # Number of retries for handling rate limits
    "MAX_RETRIES": 5,
    # Initial backoff time in seconds
    "INITIAL_BACKOFF_SEC": 1,
    # Maximum backoff time in seconds
    "MAX_BACKOFF_SEC": 60,
    # Exponential backoff multiplier
    "BACKOFF_FACTOR": 2,
    # Random jitter to add to backoff (as a fraction)
    "JITTER_FACTOR": 0.1,
    # API request timeout in seconds
    "TIMEOUT": 30,

    # --- Target tables ---
    # Main table name
    "DATASOURCES_TABLE_NAME": "pbi_datasets_datasources",
    # Details table name
    "CONNECTION_DETAILS_TABLE_NAME": "pbi_datasets_datasources_connection_details",
    # Default Tables folder in Fabric Lakehouse
    "LAKEHOUSE_PATH": "Tables",

    # --- Run behaviour ---
    # Set to True to enable extra debugging output
    "DEBUG_MODE": True,
    # Number of datasets to process in each batch
    "BATCH_SIZE": 50,
    # API rate limit
    "RATE_LIMIT_REQUESTS_PER_HOUR": 300,
}

# One random UUID per execution; stamped on every extracted record
BATCH_ID = str(uuid.uuid4())
logger.info(f"Batch ID for this execution: {BATCH_ID}")
# ==================================


# CELL 5 - Authentication Function
# ==================================
def get_access_token():
    """
    Fetch the access token used to authenticate against the Power BI REST API.

    The token is requested from mssparkutils (imported from notebookutils inside
    the function, since that package is only importable in the Fabric runtime)
    for the Power BI API resource.

    Returns:
        str: The access token

    Raises:
        Exception: Whatever the import or the token request raised; the error is
            logged first and then re-raised unchanged.
    """
    powerbi_resource = "https://analysis.windows.net/powerbi/api"
    try:
        # Imported lazily: notebookutils only exists inside Fabric notebooks
        from notebookutils import mssparkutils
        return mssparkutils.credentials.getToken(powerbi_resource)
    except Exception as e:
        logger.error(f"Failed to get access token: {str(e)}")
        raise
# ==================================


# CELL 6 - API Call Function
# ==================================
def _next_backoff(backoff_time: float) -> Tuple[float, float]:
    """
    Compute the wait for the current retry and the backoff for the following one.

    Args:
        backoff_time: The current base backoff in seconds

    Returns:
        tuple: (seconds to sleep now, base backoff to use on the next retry).
            The sleep is the current backoff plus a random jitter of up to
            JITTER_FACTOR * backoff; the next backoff is multiplied by
            BACKOFF_FACTOR and capped at MAX_BACKOFF_SEC.
    """
    jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
    next_backoff = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])
    return backoff_time + jitter, next_backoff


def call_powerbi_api(endpoint: str, access_token: str, params: Optional[Dict] = None) -> Dict:
    """
    Make a GET request to the Power BI REST API with retry and rate-limit handling.

    Behaviour per response:
    - 429: wait (Retry-After header if it is a whole number of seconds, otherwise
      exponential backoff with jitter) and retry; raises once attempts run out
    - 404 / 403: logged as a warning and reported as an empty result
    - other 4xx (except 408): logged and raised immediately, because repeating an
      identical request cannot succeed and only consumes rate-limit quota
    - 5xx, 408, connection errors, timeouts, unparsable JSON: retried with
      exponential backoff and jitter

    Args:
        endpoint: The API endpoint path (e.g., "/admin/datasets/{id}/datasources")
        access_token: The Azure AD access token
        params: Optional query parameters for the API call

    Returns:
        dict: The JSON response from the API, or {"value": []} for 404/403

    Raises:
        requests.exceptions.RequestException: If the API call fails with a
            non-retryable status, or still fails (including with 429) after
            MAX_RETRIES attempts, or if MAX_RETRIES allows no attempt at all
    """
    url = f"{CONFIG['API_BASE_URL']}{endpoint}"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }

    max_retries = CONFIG['MAX_RETRIES']
    backoff_time = CONFIG['INITIAL_BACKOFF_SEC']

    for attempt in range(max_retries):
        last_attempt = attempt == max_retries - 1
        try:
            # Log the full URL with parameters for debugging
            if CONFIG['DEBUG_MODE']:
                logger.info(f"Making API call to: {url} with params: {params} (Attempt {attempt + 1})")

            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=CONFIG['TIMEOUT']
            )
            status = response.status_code

            # Log the response status for debugging
            if CONFIG['DEBUG_MODE']:
                logger.info(f"Response status: {status}")

            # Rate limit handling (429 Too Many Requests)
            if status == 429:
                if last_attempt:
                    # Out of attempts: surface the failure instead of falling out
                    # of the loop and implicitly returning None.
                    raise requests.exceptions.HTTPError(
                        f"429 Too Many Requests: rate limit still exceeded after {max_retries} attempts for url: {url}",
                        response=response,
                    )

                retry_after = response.headers.get('Retry-After')
                if retry_after and retry_after.isdigit():
                    # If server specified a wait time, use it
                    wait_time = int(retry_after)
                else:
                    wait_time, backoff_time = _next_backoff(backoff_time)

                logger.warning(f"Rate limit exceeded (429). Waiting {wait_time:.2f} seconds before retry.")
                time.sleep(wait_time)
                continue

            # Handle 404 - Dataset not found or no datasources
            if status == 404:
                logger.warning(f"Dataset not found or no datasources available: {endpoint}")
                return {"value": []}

            # Handle 403 - Forbidden (insufficient permissions)
            if status == 403:
                logger.warning(f"Insufficient permissions for dataset: {endpoint}")
                return {"value": []}

            # Log other errors for debugging
            if status >= 400:
                logger.error(f"API error: Status {status}, Response: {response.text}")
                logger.error(f"Request URL: {response.request.url}")
                # Never write the bearer token to the logs
                safe_headers = {
                    name: ("<redacted>" if name.lower() == "authorization" else value)
                    for name, value in response.request.headers.items()
                }
                logger.error(f"Request headers: {safe_headers}")

            # Raises HTTPError for any remaining 4xx/5xx status
            response.raise_for_status()

            # Parse and return the JSON response. ValueError is the common base of
            # the JSON decode errors raised by json, simplejson and requests.
            try:
                response_json = response.json()
            except ValueError as e:
                logger.error(f"Failed to parse response as JSON: {str(e)}")
                logger.error(f"Response content: {response.text[:1000]}")  # Log first 1000 chars
                raise

            if CONFIG['DEBUG_MODE'] and "value" in response_json:
                logger.info(f"Response contains {len(response_json['value'])} datasources")
            return response_json

        except requests.exceptions.RequestException as e:
            failed_response = getattr(e, 'response', None)
            failed_status = failed_response.status_code if failed_response is not None else None
            # No response at all (connection error, timeout, bad JSON) or a
            # server-side / throttling status is worth another attempt.
            retryable = failed_status is None or failed_status >= 500 or failed_status in (408, 429)

            logger.warning(f"API call failed (Attempt {attempt + 1}): {str(e)}")

            if not retryable:
                logger.error(f"Not retrying endpoint {endpoint}: HTTP {failed_status} is not a transient error")
                raise

            if last_attempt:
                logger.error(f"All retry attempts failed for endpoint: {endpoint}")
                logger.error(f"Final error: {str(e)}")
                raise

            wait_time, backoff_time = _next_backoff(backoff_time)
            logger.info(f"Waiting {wait_time:.2f} seconds before retry.")
            time.sleep(wait_time)

    # Only reachable when MAX_RETRIES <= 0, i.e. the loop body never ran
    raise requests.exceptions.RetryError(
        f"No API attempts were made for endpoint {endpoint}: MAX_RETRIES={max_retries}"
    )
# ==================================


# CELL 7 - Get Dataset IDs Function
# ==================================
def get_dataset_ids() -> List[str]:
    """
    Return the IDs of the datasets whose datasources should be extracted.

    Runs a Spark SQL query joining fabric_capacities, fabric_workspaces and
    pbi_datasets, restricted to the 'MDA Institutional Capacity - PROD' capacity
    and to active, non-personal workspaces.

    Returns:
        list: A list of dataset ID strings (the `id` column of each result row)

    Raises:
        Exception: Any error from running or collecting the query; the error and
            the query text are logged before it is re-raised.
    """
    logger.info("Executing SQL query to get dataset IDs...")

    # SQL query provided in the requirements
    sql_query = """
    SELECT ds.id
    FROM 
        FabricAdmin_Lakehouse.dbo.fabric_capacities AS fc
    JOIN 
        FabricAdmin_Lakehouse.dbo.fabric_workspaces AS fw
        ON UPPER(fc.id) = fw.capacityId  -- Join on capacityId
    JOIN 
        FabricAdmin_Lakehouse.dbo.pbi_datasets AS ds
        ON fw.id = ds.workspaceId  -- Join on workspaceid
    WHERE 
        fc.displayName = 'MDA Institutional Capacity - PROD'  -- Filter for specific displayName
        and fw.state = 'Active'  -- Filter for active workspaces
        AND fw.type <> 'Personal'  -- Exclude personal workspaces
    """

    try:
        # Pull every result row to the driver and keep only the id column
        collected_rows = spark.sql(sql_query).collect()
        dataset_ids = []
        for result_row in collected_rows:
            dataset_ids.append(result_row.id)

        logger.info(f"Found {len(dataset_ids)} datasets to process")

        show_sample = CONFIG['DEBUG_MODE'] and dataset_ids
        if show_sample:
            logger.info(f"First 5 dataset IDs: {dataset_ids[:5]}")

        return dataset_ids

    except Exception as e:
        logger.error(f"Failed to execute SQL query: {str(e)}")
        logger.error(f"Query: {sql_query}")
        raise
# ==================================


# CELL 8 - Get Dataset Datasources Function
# ==================================
def get_dataset_datasources(dataset_id: str, access_token: str) -> Tuple[List[Dict], List[Dict]]:
    """
    Fetch one dataset's datasources and shape them into table-ready records.

    Calls the datasources endpoint for the dataset via call_powerbi_api and
    produces one main record per datasource plus one record per key/value pair
    found in that datasource's "connectionDetails" dictionary.

    Args:
        dataset_id: The ID of the dataset to get datasources for
        access_token: The Azure AD access token

    Returns:
        tuple: (main_datasources_data, connection_details_data)
            - main_datasources_data: List of main datasource records
            - connection_details_data: List of flattened connection detail records
        Both lists are empty when the dataset has no datasources or when any
        error occurs (errors are logged, never raised, so the caller can move on
        to the next dataset).
    """
    endpoint = CONFIG['DATASOURCES_ENDPOINT_TEMPLATE'].format(dataset_id=dataset_id)

    try:
        datasources = call_powerbi_api(endpoint, access_token).get("value", [])

        # Guard clause: nothing to flatten
        if not datasources:
            if CONFIG['DEBUG_MODE']:
                logger.info(f"No datasources found for dataset {dataset_id}")
            return [], []

        logger.info(f"Found {len(datasources)} datasources for dataset {dataset_id}")

        main_datasources_data = []
        connection_details_data = []

        for datasource in datasources:
            datasource_id = datasource.get("datasourceId")

            # One row per datasource for the main table
            main_datasources_data.append({
                "dataset_id": dataset_id,
                "datasource_id": datasource_id,
                "datasource_name": datasource.get("name"),
                "datasource_type": datasource.get("datasourceType"),
                "gateway_id": datasource.get("gatewayId"),
                "connection_string": datasource.get("connectionString"),
                "extraction_timestamp": datetime.now(),
                "extraction_batch_id": BATCH_ID
            })

            # One row per connection property; anything that is not a non-empty
            # dict is skipped
            details = datasource.get("connectionDetails", {})
            if not (details and isinstance(details, dict)):
                continue

            for property_name, property_value in details.items():
                # Values are stored as text; None stays None
                if property_value is None:
                    value_text = None
                else:
                    value_text = str(property_value)

                connection_details_data.append({
                    "dataset_id": dataset_id,
                    "datasource_id": datasource_id,
                    "connection_property": property_name,
                    "connection_value": value_text,
                    "extraction_timestamp": datetime.now(),
                    "extraction_batch_id": BATCH_ID
                })

        if CONFIG['DEBUG_MODE'] and main_datasources_data:
            logger.info(f"Sample datasource for dataset {dataset_id}: {json.dumps(main_datasources_data[0], default=str, indent=2)}")

        return main_datasources_data, connection_details_data

    except Exception as e:
        logger.error(f"Failed to get datasources for dataset {dataset_id}: {str(e)}")
        # Return empty lists instead of raising to continue with other datasets
        return [], []
# ==================================


# CELL 9 - Process Datasets in Batches Function
# ==================================
def process_datasets_in_batches(dataset_ids: List[str], access_token: str) -> Tuple[List[Dict], List[Dict]]:
    """
    Process multiple datasets in batches, pacing requests to respect API rate limits.

    The pause between consecutive requests is derived from
    CONFIG['RATE_LIMIT_REQUESTS_PER_HOUR'] with a 25% safety margin (300/hour ->
    12 s minimum spacing -> 15 s actual delay). An additional fixed pause is taken
    between batches. A dataset that fails is logged and skipped; the pacing delay
    is still applied afterwards so failures do not speed up the request rate.

    Args:
        dataset_ids: List of dataset IDs to process
        access_token: The Azure AD access token

    Returns:
        tuple: (all_main_data, all_connection_details_data)
    """
    all_main_data: List[Dict] = []
    all_connection_details_data: List[Dict] = []

    total_datasets = len(dataset_ids)
    # A batch size below 1 would make range() raise (step 0) or never iterate
    batch_size = max(1, int(CONFIG['BATCH_SIZE']))
    total_batches = (total_datasets + batch_size - 1) // batch_size

    # Calculate timing for rate limiting from the configured hourly limit.
    # Stretching the minimum spacing by 25% keeps us below the limit
    # (at 300/hour: 12 s * 1.25 = 15 s, i.e. 4 requests per minute).
    requests_per_hour = max(1, CONFIG.get('RATE_LIMIT_REQUESTS_PER_HOUR', 300))
    safety_margin = 1.25
    delay_between_requests = (3600 / requests_per_hour) * safety_margin
    batch_delay = 30  # 30 seconds between batches

    # NOTE(review): the same access_token is used for the whole run; a run that
    # outlasts the token's lifetime would presumably start failing - confirm
    # whether a refresh is needed for large dataset counts.

    logger.info(f"Processing {total_datasets} datasets in batches of {batch_size}")
    logger.info(f"Using {delay_between_requests} seconds delay between API calls to respect rate limits")

    for batch_num, batch_start in enumerate(range(0, total_datasets, batch_size), start=1):
        batch_datasets = dataset_ids[batch_start:batch_start + batch_size]

        logger.info(f"Processing batch {batch_num}/{total_batches} ({len(batch_datasets)} datasets)")

        batch_main_data = []
        batch_connection_details_data = []

        for j, dataset_id in enumerate(batch_datasets):
            try:
                logger.info(f"Processing dataset {batch_start + j + 1}/{total_datasets}: {dataset_id}")

                main_data, connection_details_data = get_dataset_datasources(dataset_id, access_token)

                batch_main_data.extend(main_data)
                batch_connection_details_data.extend(connection_details_data)

            except Exception as e:
                # Log and carry on with the next dataset
                logger.error(f"Failed to process dataset {dataset_id}: {str(e)}")

            # Add delay between requests to respect rate limits (except for the
            # last request); applied after failures too, since the request was
            # still sent
            if j < len(batch_datasets) - 1:
                time.sleep(delay_between_requests)

        # Add batch results to overall results
        all_main_data.extend(batch_main_data)
        all_connection_details_data.extend(batch_connection_details_data)

        logger.info(f"Batch {batch_num} completed. Found {len(batch_main_data)} datasources total")

        # Add delay between batches if not the last batch
        if batch_num < total_batches:
            logger.info(f"Waiting {batch_delay} seconds before next batch...")
            time.sleep(batch_delay)

    logger.info(f"All batches completed. Total datasources found: {len(all_main_data)}")
    logger.info(f"Total connection detail records: {len(all_connection_details_data)}")

    return all_main_data, all_connection_details_data
# ==================================


# CELL 10 - Create DataFrames Function
# ==================================
def create_datasources_dataframes(main_data: List[Dict], connection_details_data: List[Dict]) -> Tuple["DataFrame", "DataFrame"]:
    """
    Build the two Spark DataFrames that are loaded into the Delta tables.

    Each record list is converted through a pandas DataFrame into a Spark
    DataFrame with an explicit schema. An empty record list yields an empty
    Spark DataFrame with the same schema.

    Args:
        main_data: List of main datasource dictionaries
        connection_details_data: List of connection detail dictionaries

    Returns:
        tuple: (main_df, connection_details_df)
    """
    # Schema for main datasources table
    main_schema = StructType([
        StructField("dataset_id", StringType(), False),
        StructField("datasource_id", StringType(), True),
        StructField("datasource_name", StringType(), True),
        StructField("datasource_type", StringType(), True),
        StructField("gateway_id", StringType(), True),
        StructField("connection_string", StringType(), True),
        StructField("extraction_timestamp", TimestampType(), False),
        StructField("extraction_batch_id", StringType(), False)
    ])

    # Schema for connection details table
    connection_details_schema = StructType([
        StructField("dataset_id", StringType(), False),
        StructField("datasource_id", StringType(), True),
        StructField("connection_property", StringType(), False),
        StructField("connection_value", StringType(), True),
        StructField("extraction_timestamp", TimestampType(), False),
        StructField("extraction_batch_id", StringType(), False)
    ])

    def _records_to_spark(records, schema, created_label, empty_warning):
        # Shared conversion for both tables; only the log wording differs
        if not records:
            logger.warning(empty_warning)
            return spark.createDataFrame([], schema)
        spark_df = spark.createDataFrame(pd.DataFrame(records), schema=schema)
        logger.info(f"Created {created_label} DataFrame with {spark_df.count()} rows")
        return spark_df

    main_df = _records_to_spark(
        main_data,
        main_schema,
        "main datasources",
        "No main datasource data found. Creating empty DataFrame.",
    )
    connection_details_df = _records_to_spark(
        connection_details_data,
        connection_details_schema,
        "connection details",
        "No connection details data found. Creating empty DataFrame.",
    )

    return main_df, connection_details_df
# ==================================


# CELL 11 - Delta Lake Operations Functions
# ==================================
def ensure_delta_table_exists(table_name: str, df_schema):
    """
    Ensure the target table exists, creating an empty one if necessary.

    Existence is checked through the Spark catalog rather than by treating any
    failure of a probe query as "missing": a transient catalog error must not
    lead to an existing table being replaced by an empty one. The create step
    uses write mode "ignore", so it can never overwrite data even if the table
    appears between the check and the write.

    Args:
        table_name: Name of the Delta table
        df_schema: Schema of the DataFrame

    Raises:
        Exception: Any error from the catalog lookup or the table creation is
            propagated to the caller.
    """
    if spark.catalog.tableExists(table_name):
        logger.info(f"Delta table '{table_name}' already exists")
        return

    logger.info(f"Creating Delta table '{table_name}'")

    # Create an empty DataFrame with the schema
    empty_df = spark.createDataFrame([], df_schema)

    # Create the table without ever replacing existing data. The table format is
    # the session default, as before.
    empty_df.write \
        .mode("ignore") \
        .saveAsTable(table_name)

    logger.info(f"Delta table '{table_name}' created successfully")


def merge_data_to_delta(source_df, table_name: str, merge_keys: List[str]):
    """
    Merge new data into the Delta table using MERGE operation.

    This function performs an upsert operation:
    - Updates existing records if the merge keys match
    - Inserts new records if the merge keys don't exist

    Keys are compared with the null-safe operator (<=>) so that rows whose key
    contains NULL (e.g. a missing datasource_id) match their earlier copy instead
    of being inserted again on every run. The source is reduced to one row per
    key first, because MERGE fails when several source rows match the same
    target row; which of several duplicate-key rows is kept is not defined.

    Args:
        source_df: DataFrame with new data
        table_name: Name of the target Delta table
        merge_keys: List of column names to use for matching records
    """
    logger.info(f"Starting merge operation for {table_name}")

    # One source row per merge key
    deduped_df = source_df.dropDuplicates(merge_keys)

    # If the table is empty, just insert all records.
    # limit(1) avoids counting the whole table just to test for emptiness.
    if spark.table(table_name).limit(1).count() == 0:
        logger.info(f"Table {table_name} is empty. Inserting all records.")
        deduped_df.write.mode("append").saveAsTable(table_name)
        return

    # Create a temporary view for the merge operation
    deduped_df.createOrReplaceTempView("source_updates")

    # Build the null-safe merge condition
    merge_condition = " AND ".join(f"target.{key} <=> source.{key}" for key in merge_keys)

    # Columns for the UPDATE SET clause (excluding merge keys to avoid redundancy).
    # When every column is a key there is nothing to update, so the
    # WHEN MATCHED clause is left out rather than emitting an empty SET list.
    update_columns = [name for name in source_df.columns if name not in merge_keys]
    matched_clause = ""
    if update_columns:
        update_set_clause = ", ".join(f"target.{name} = source.{name}" for name in update_columns)
        matched_clause = f"WHEN MATCHED THEN\n        UPDATE SET {update_set_clause}"

    # Perform the merge operation
    merge_query = f"""
    MERGE INTO {table_name} AS target
    USING source_updates AS source
    ON {merge_condition}
    {matched_clause}
    WHEN NOT MATCHED THEN
        INSERT *
    """

    if CONFIG['DEBUG_MODE']:
        logger.info(f"Merge query: {merge_query}")

    spark.sql(merge_query)
    logger.info("Merge operation completed successfully")


def optimize_delta_table(table_name: str):
    """
    Refresh the table's statistics to help query planning.

    Runs ANALYZE TABLE ... COMPUTE STATISTICS on the table. This step is
    best-effort: any failure is logged as a warning and swallowed so the load
    itself is not affected.

    Args:
        table_name: Name of the Delta table to optimize
    """
    logger.info(f"Optimizing Delta table '{table_name}'")

    analyze_statement = f"ANALYZE TABLE {table_name} COMPUTE STATISTICS"
    try:
        # Update table statistics for better query planning
        spark.sql(analyze_statement)
    except Exception as e:
        logger.warning(f"Table optimization step encountered an issue: {str(e)}")
        logger.info("Continuing with process - optimization is not critical for functionality")
    else:
        for success_message in (
            "Table statistics updated successfully",
            "Delta table optimization completed via statistics computation",
            "Note: Microsoft Fabric may automatically optimize Delta tables",
        ):
            logger.info(success_message)
# ==================================


# CELL 12 - Main Execution Function
# ==================================
def _report_table_statistics(main_table_name: str, connection_details_table_name: str):
    """Log DESCRIBE DETAIL output, row counts and summary queries for both tables."""
    # Show table information for main table
    logger.info(f"Main table ({main_table_name}) details:")
    spark.sql(f"DESCRIBE DETAIL {main_table_name}").show(truncate=False)

    main_row_count = spark.table(main_table_name).count()
    logger.info(f"Total rows in {main_table_name}: {main_row_count}")

    # Show table information for connection details table
    logger.info(f"Connection details table ({connection_details_table_name}) details:")
    spark.sql(f"DESCRIBE DETAIL {connection_details_table_name}").show(truncate=False)

    connection_details_row_count = spark.table(connection_details_table_name).count()
    logger.info(f"Total rows in {connection_details_table_name}: {connection_details_row_count}")

    # Show summary statistics for main table
    if main_row_count > 0:
        summary_stats = spark.sql(f"""
                SELECT 
                    COUNT(*) as total_datasources,
                    COUNT(DISTINCT dataset_id) as unique_datasets,
                    COUNT(DISTINCT datasource_type) as datasource_types,
                    COUNT(DISTINCT gateway_id) as unique_gateways,
                    MAX(extraction_timestamp) as last_updated
                FROM {main_table_name}
            """)

        logger.info("Main table summary statistics:")
        summary_stats.show(truncate=False)

        # Show distribution by datasource type
        type_distribution = spark.sql(f"""
                SELECT 
                    datasource_type,
                    COUNT(*) as count
                FROM {main_table_name}
                GROUP BY datasource_type
                ORDER BY count DESC
            """)

        logger.info("Datasource distribution by type:")
        type_distribution.show(truncate=False)

    # Show summary for connection details
    if connection_details_row_count > 0:
        connection_summary = spark.sql(f"""
                SELECT 
                    COUNT(*) as total_connection_properties,
                    COUNT(DISTINCT connection_property) as unique_properties,
                    COUNT(DISTINCT dataset_id) as datasets_with_details
                FROM {connection_details_table_name}
            """)

        logger.info("Connection details summary:")
        connection_summary.show(truncate=False)


def main():
    """
    Main execution function that orchestrates the entire process.

    This function:
    1. Gets the authentication token
    2. Retrieves dataset IDs from the SQL query
    3. Processes datasets in batches to get their datasources
    4. Creates PySpark DataFrames with the data
    5. Loads data into Delta Lake tables
    6. Optimizes the tables for analytics
    7. Logs final table statistics

    Returns:
        tuple: (main_df, connection_details_df). When the SQL query yields no
        dataset IDs, nothing is loaded and both DataFrames are empty (with the
        normal schemas), so callers can always unpack two DataFrames.

    Raises:
        Exception: Any error from the steps above is logged and re-raised.
    """
    try:
        logger.info("Starting Dataset Datasources to Delta Lake process")

        # Step 1: Get authentication token
        logger.info("Getting access token...")
        access_token = get_access_token()
        logger.info("Successfully obtained access token")

        # Step 2: Get dataset IDs to process
        logger.info("Getting dataset IDs from SQL query...")
        dataset_ids = get_dataset_ids()

        if not dataset_ids:
            logger.warning("No dataset IDs found. Please check your SQL query and data.")
            # Keep the return shape consistent: a bare `return` would make the
            # caller's two-value unpacking fail with a TypeError.
            return create_datasources_dataframes([], [])

        # Step 3: Process datasets in batches
        logger.info("Processing datasets to get datasources...")
        main_data, connection_details_data = process_datasets_in_batches(dataset_ids, access_token)

        # Step 4: Create DataFrames
        logger.info("Creating DataFrames...")
        main_df, connection_details_df = create_datasources_dataframes(main_data, connection_details_data)

        # Show sample data. Each count is a Spark action, so compute it once
        # and reuse it for the merge decision below.
        main_count = main_df.count()
        if main_count > 0:
            logger.info("Sample of main datasources data:")
            main_df.show(5, truncate=False)

        connection_details_count = connection_details_df.count()
        if connection_details_count > 0:
            logger.info("Sample of connection details data:")
            connection_details_df.show(10, truncate=False)

        # Step 5: Prepare Delta tables
        main_table_name = CONFIG["DATASOURCES_TABLE_NAME"]
        connection_details_table_name = CONFIG["CONNECTION_DETAILS_TABLE_NAME"]

        ensure_delta_table_exists(main_table_name, main_df.schema)
        ensure_delta_table_exists(connection_details_table_name, connection_details_df.schema)

        # Step 6: Merge data into Delta tables
        if main_count > 0:
            merge_data_to_delta(main_df, main_table_name, ["dataset_id", "datasource_id"])
            optimize_delta_table(main_table_name)

        if connection_details_count > 0:
            merge_data_to_delta(connection_details_df, connection_details_table_name,
                                ["dataset_id", "datasource_id", "connection_property"])
            optimize_delta_table(connection_details_table_name)

        # Step 7: Display final statistics
        logger.info("Loading completed successfully!")
        _report_table_statistics(main_table_name, connection_details_table_name)

        return main_df, connection_details_df

    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")
        raise
# ==================================


# CELL 13 - Execute Main Function
# ==================================
# Run the full pipeline and bind both resulting DataFrames to module-level names.
# The tuple unpacking requires main() to return exactly two values.
if __name__ == "__main__":
    main_df, connection_details_df = main()
# ==================================


StatementMeta(, bc48bd6a-b0ef-452f-a9f2-189999c838f6, 4, Finished, Available, Finished)

2025-07-17 15:39:00,938 - INFO - Batch ID for this execution: da1dd779-45ba-42f0-bf98-49a53d0d516c
2025-07-17 15:39:00,942 - INFO - Starting Dataset Datasources to Delta Lake process
2025-07-17 15:39:00,942 - INFO - Getting access token...
2025-07-17 15:39:00,957 - INFO - Successfully obtained access token
2025-07-17 15:39:00,957 - INFO - Getting dataset IDs from SQL query...
2025-07-17 15:39:00,957 - INFO - Executing SQL query to get dataset IDs...
2025-07-17 15:39:21,253 - INFO - Found 147 datasets to process
2025-07-17 15:39:21,255 - INFO - First 5 dataset IDs: ['d9a141b5-20e2-4c8a-825b-02c0f63bd691', 'c6c5b1e8-4f0c-400e-a396-72048f9b3871', '187c3787-c4a5-47a3-a35b-149b72bbd070', '09eac459-9b71-4e9b-8771-45a142078a0d', '36e4c8d8-d521-4577-93f8-307633711275']
2025-07-17 15:39:21,257 - INFO - Processing datasets to get datasources...
2025-07-17 15:39:21,258 - INFO - Processing 147 datasets in batches of 50
2025-07-17 15:39:21,258 - INFO - Using 15.0 seconds delay between API calls to 

+------------------------------------+------------------------------------+---------------+---------------+------------------------------------+-----------------+--------------------------+------------------------------------+
|dataset_id                          |datasource_id                       |datasource_name|datasource_type|gateway_id                          |connection_string|extraction_timestamp      |extraction_batch_id                 |
+------------------------------------+------------------------------------+---------------+---------------+------------------------------------+-----------------+--------------------------+------------------------------------+
|d9a141b5-20e2-4c8a-825b-02c0f63bd691|6270f3eb-9cd1-4028-b4eb-25713ef9c5e2|NULL           |Extension      |7e508ae6-3f91-43a5-ba20-e24caa4b7fd5|NULL             |2025-07-17 15:39:21.651736|da1dd779-45ba-42f0-bf98-49a53d0d516c|
|c6c5b1e8-4f0c-400e-a396-72048f9b3871|30d92dc2-4a7b-4719-92a7-97c26a879d8a|NULL           |E

In [3]:
from pyspark.sql import SparkSession

# Get the Spark session for this cell
spark = SparkSession.builder.appName("Refresh SQL Endpoint Metadata").getOrCreate()

# Issue REFRESH TABLE for each table loaded by this notebook, confirming each one
for refreshed_table in (
    "pbi_datasets_datasources",
    "pbi_datasets_datasources_connection_details",
):
    spark.sql(f"REFRESH TABLE {refreshed_table}")
    print("Metadata refresh triggered successfully.")


StatementMeta(, bc48bd6a-b0ef-452f-a9f2-189999c838f6, 5, Finished, Available, Finished)

Metadata refresh triggered successfully.
Metadata refresh triggered successfully.
