In [1]:
# DO NOT DELETE THIS CELL

# ***IMPORTANT***: This only pulls datasources for workspaces on the "MDA Institutional Capacity - PROD" capacity. See the SQL query in get_dataset_ids() (CELL 5).
# Some of these API calls result in "404 Client Error: Not Found for url". This appears to happen when no datasource exists for a particular dataset (unconfirmed).

# API Name: Datasets - Get Datasources
# Command:  GET https://api.powerbi.com/v1.0/myorg/datasets/{datasetId}/datasources
# Doc:      https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/get-datasources

# Loads table: pbi_datasources
# Loads table: pbi_datasources_connection_details


StatementMeta(, fad4e198-0861-46c0-97cc-62ecf271dbdc, 3, Finished, Available, Finished)

In [2]:
# CELL 1 - Title and Introduction
# ==================================
# Microsoft Power BI Datasources to Delta Lake - PySpark Notebook
# This notebook retrieves Power BI datasources using the Get Datasources API and loads them into Delta Lake tables
# with optimization for analytics workloads. Data is stored in two normalized tables:
# 1. pbi_datasources - Main datasource information
# 2. pbi_datasources_connection_details - Connection detail properties in key-value format
# ==================================


# CELL 2 - Import Libraries
# ==================================
# Import required libraries for API calls, data processing, and Delta Lake operations
import requests
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit, from_json, explode, create_map
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, ArrayType, MapType
import logging
from typing import Dict, List, Optional, Tuple
from delta.tables import DeltaTable
import random
import time
import uuid
from datetime import datetime
# ==================================


# CELL 3 - Configure Logging and Initialize Spark
# ==================================
# Send INFO-and-above records to the root handler with a timestamped format,
# and create the module-level logger used by every function in this notebook.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Obtain (or reuse) the Spark session with the Delta Lake extension and catalog.
# In Fabric notebooks a session already exists, so getOrCreate() returns it.
spark = (
    SparkSession.builder
    .appName("PowerBIDatasourcesToDelta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Ask Delta to optimize file sizes on write and compact small files automatically.
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")
# ==================================


# CELL 4 - Configuration Parameters
# ==================================
# Configuration parameters for the notebook execution.
# Functions below read these values through CONFIG[...] lookups at call time.
CONFIG = {
    "API_BASE_URL": "https://api.powerbi.com/v1.0/myorg",
    "DATASOURCES_ENDPOINT": "/datasets/{dataset_id}/datasources",  # Endpoint template; {dataset_id} is filled in via str.format
    "MAX_RETRIES": 1,  # Used as range(MAX_RETRIES) in call_powerbi_api, i.e. TOTAL attempts per request (1 = no retry)
    "INITIAL_BACKOFF_SEC": 1,  # Initial backoff time in seconds
    "MAX_BACKOFF_SEC": 60,  # Maximum backoff time in seconds
    "BACKOFF_FACTOR": 2,  # Exponential backoff multiplier
    "JITTER_FACTOR": 0.1,  # Random jitter to add to backoff (as a fraction of the current backoff)
    "TIMEOUT": 30,  # API request timeout in seconds
    "DATASOURCES_TABLE_NAME": "pbi_datasources",  # Main datasources table
    "CONNECTION_DETAILS_TABLE_NAME": "pbi_datasources_connection_details",  # Connection details table
    "LAKEHOUSE_PATH": "Tables",  # Default Tables folder in Fabric Lakehouse (not referenced by the code in this cell)
    "DEBUG_MODE": True,  # Set to True to enable extra debugging output
    "TEST_LIMIT": None,  # Optional LIMIT appended to the dataset-ID query; None = no limit (set e.g. 20 for a test run)
    "BATCH_SIZE": 10  # Number of datasets per batch; a pause is inserted between batches
}

# Generate a unique batch ID for this execution run; it is stamped on every
# extracted record as extraction_batch_id.
BATCH_ID = str(uuid.uuid4())
logger.info(f"Starting execution with batch ID: {BATCH_ID}")
# ==================================


# CELL 5 - Get Dataset IDs Function
# ==================================
def get_dataset_ids() -> List[str]:
    """
    Fetch the IDs of the datasets whose datasources should be extracted.

    Joins the capacities, workspaces and datasets tables in
    FabricAdmin_Lakehouse and keeps datasets in active, non-personal
    workspaces on the 'MDA Institutional Capacity - PROD' capacity.
    When CONFIG['TEST_LIMIT'] is truthy, a LIMIT clause is appended.

    Returns:
        list: Dataset ID strings, one per row returned by the query.

    Raises:
        Exception: Any error from building or running the query is logged
            and re-raised unchanged.
    """
    try:
        logger.info("Executing SQL query to retrieve dataset IDs...")

        # Capacity -> workspace -> dataset join; the capacity filter is hard-coded here.
        sql_query = """
        SELECT ds.id
        FROM 
            FabricAdmin_Lakehouse.dbo.fabric_capacities AS fc
        JOIN 
            FabricAdmin_Lakehouse.dbo.fabric_workspaces AS fw
            ON UPPER(fc.id) = fw.capacityId  
        JOIN 
            FabricAdmin_Lakehouse.dbo.pbi_datasets AS ds
            ON fw.id = ds.workspaceId  
        WHERE 
            fc.displayName = 'MDA Institutional Capacity - PROD'  
            AND fw.state = 'Active'  
            AND fw.type <> 'Personal'
        """

        # Optional cap on the number of datasets, used for test runs.
        test_limit = CONFIG['TEST_LIMIT']
        if test_limit:
            sql_query = sql_query + f" LIMIT {test_limit}"
            logger.info(f"Limiting to {CONFIG['TEST_LIMIT']} datasets for testing")

        # Run the query on the driver and pull the single 'id' column into a list.
        rows = spark.sql(sql_query).collect()
        dataset_ids = [row["id"] for row in rows]

        logger.info(f"Retrieved {len(dataset_ids)} dataset IDs from the database")

        debug_enabled = CONFIG['DEBUG_MODE']
        if debug_enabled and dataset_ids:
            logger.info(f"Sample dataset IDs: {dataset_ids[:5]}")

        return dataset_ids

    except Exception as err:
        logger.error(f"Failed to retrieve dataset IDs: {str(err)}")
        raise
# ==================================


# CELL 6 - Authentication Function
# ==================================
def get_access_token():
    """
    Obtain an access token for the Power BI REST API.

    The token is requested from mssparkutils (imported lazily from
    notebookutils, which is only available inside a Fabric notebook)
    for the Power BI API resource.

    Returns:
        The value returned by mssparkutils.credentials.getToken, used by
        callers as the bearer token.

    Raises:
        Exception: Any failure (including notebookutils being unavailable)
            is logged and re-raised unchanged.
    """
    powerbi_resource = "https://analysis.windows.net/powerbi/api"
    try:
        # Imported here so the rest of the notebook can be loaded outside Fabric.
        from notebookutils import mssparkutils
        return mssparkutils.credentials.getToken(powerbi_resource)
    except Exception as err:
        logger.error(f"Failed to get access token: {str(err)}")
        raise
# ==================================


# CELL 7 - API Call Function
# ==================================
def call_powerbi_api(endpoint: str, access_token: str, params: Optional[Dict] = None) -> Dict:
    """
    Make a GET request to the Power BI REST API with retry and backoff handling.

    Behavior:
    - Sends the request with a bearer token and the configured timeout.
    - On HTTP 429, waits for the server's Retry-After (when it is a plain
      number of seconds) or an exponential backoff with jitter, then retries.
    - On network errors, HTTP 408 and HTTP 5xx, retries with exponential
      backoff and jitter.
    - On any other HTTP 4xx (e.g. 404 for a dataset without datasources),
      fails immediately; retrying cannot change the outcome.
    - Never returns None: once attempts are exhausted the last error is raised.

    Args:
        endpoint: The API endpoint path (e.g., "/datasets/{dataset_id}/datasources")
        access_token: The Azure AD access token
        params: Optional query parameters for the API call

    Returns:
        dict: The parsed JSON response from the API

    Raises:
        requests.exceptions.RequestException: If the API call fails after all
            attempts, or fails with a non-retryable status.
        ValueError: If a successful response body is not valid JSON (on older
            requests versions; newer versions raise a RequestException subclass).
    """
    url = f"{CONFIG['API_BASE_URL']}{endpoint}"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }

    # CONFIG['MAX_RETRIES'] is the total number of attempts; always try at least once
    # so a misconfigured 0 cannot make this function silently return None.
    max_attempts = max(1, int(CONFIG['MAX_RETRIES']))
    backoff_time = CONFIG['INITIAL_BACKOFF_SEC']

    def _next_backoff_wait() -> float:
        """Return the current backoff plus jitter and advance the backoff for the next retry."""
        nonlocal backoff_time
        jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
        wait_time = backoff_time + jitter
        backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])
        return wait_time

    for attempt in range(max_attempts):
        last_attempt = attempt == max_attempts - 1
        try:
            # Log the full URL with parameters for debugging
            if CONFIG['DEBUG_MODE']:
                logger.info(f"Making API call to: {url} with params: {params} (Attempt {attempt + 1})")

            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=CONFIG['TIMEOUT']
            )

            if CONFIG['DEBUG_MODE']:
                logger.info(f"Response status: {response.status_code}")

            # Rate limited and attempts remain: wait, then retry.
            # On the last attempt we fall through so raise_for_status() surfaces the 429.
            if response.status_code == 429 and not last_attempt:
                retry_after = response.headers.get('Retry-After')
                if retry_after and retry_after.isdigit():
                    # The server told us how long to wait; honor it.
                    wait_time = int(retry_after)
                else:
                    wait_time = _next_backoff_wait()

                logger.warning(f"Rate limit exceeded (429). Waiting {wait_time:.2f} seconds before retry.")
                time.sleep(wait_time)
                continue

            if response.status_code >= 400:
                # Never write the bearer token to the logs.
                safe_headers = {
                    name: ("<redacted>" if name.lower() == "authorization" else value)
                    for name, value in response.request.headers.items()
                }
                logger.error(f"API error: Status {response.status_code}, Response: {response.text}")
                logger.error(f"Request URL: {response.request.url}")
                logger.error(f"Request headers: {safe_headers}")

            # Raises requests.exceptions.HTTPError for any 4xx/5xx status.
            response.raise_for_status()

            try:
                response_json = response.json()
            except ValueError as e:
                # json.JSONDecodeError and requests' own JSONDecodeError are both ValueError subclasses.
                logger.error(f"Failed to parse response as JSON: {str(e)}")
                logger.error(f"Response content: {response.text[:1000]}")  # Log first 1000 chars of response
                raise

            if isinstance(response_json, dict) and isinstance(response_json.get("value"), list):
                logger.info(f"Response contains {len(response_json['value'])} datasources")
            return response_json

        except requests.exceptions.RequestException as e:
            status_code = getattr(getattr(e, 'response', None), 'status_code', None)
            # No status (connection error, timeout, bad body), 408, 429 and 5xx may succeed on retry;
            # other 4xx responses are deterministic client errors.
            retryable = status_code is None or status_code in (408, 429) or status_code >= 500

            logger.warning(f"API call failed (Attempt {attempt + 1}): {str(e)}")

            if not retryable:
                logger.error(f"Non-retryable error (status {status_code}) for endpoint: {endpoint}")
                raise

            if last_attempt:
                logger.error(f"All retry attempts failed for endpoint: {endpoint}")
                logger.error(f"Final error: {str(e)}")
                raise

            wait_time = _next_backoff_wait()
            logger.info(f"Waiting {wait_time:.2f} seconds before retry.")
            time.sleep(wait_time)
# ==================================


# CELL 8 - Get Datasources Function
# ==================================
def get_datasources_for_dataset(dataset_id: str, access_token: str) -> Tuple[List[Dict], List[Dict]]:
    """
    Fetch one dataset's datasources and flatten them into two record lists.

    Calls the Get Datasources endpoint for ``dataset_id`` and, for each entry
    in the response's "value" list, emits one main record plus one key-value
    record per non-null entry of its "connectionDetails" dict.

    Args:
        dataset_id: The dataset ID to get datasources for
        access_token: The Azure AD access token

    Returns:
        tuple: (datasources_records, connection_details_records)
            - datasources_records: one dict per datasource
            - connection_details_records: one dict per connection property

        Any exception is logged and turned into ``([], [])`` so that the
        caller can carry on with the next dataset.
    """
    try:
        # Build the dataset-specific endpoint and call the API.
        api_path = CONFIG['DATASOURCES_ENDPOINT'].format(dataset_id=dataset_id)
        payload = call_powerbi_api(api_path, access_token)

        datasources = payload.get("value", [])
        if not datasources:
            logger.info(f"No datasources found for dataset {dataset_id}")
            return [], []

        logger.info(f"Retrieved {len(datasources)} datasources for dataset {dataset_id}")

        datasources_records = []
        connection_details_records = []

        for entry in datasources:
            source_id = entry.get("datasourceId")

            # Main row. Key order matters: it must line up with the table schema.
            datasources_records.append({
                "dataset_id": dataset_id,
                "datasource_type": entry.get("datasourceType"),
                "datasource_id": source_id,
                "gateway_id": entry.get("gatewayId"),
                "connection_string": entry.get("connectionString"),  # Deprecated field
                "name": entry.get("name"),  # Deprecated field
                "extraction_date": datetime.now(),
                "extraction_batch_id": BATCH_ID
            })

            # Connection details become one row per property; skip anything
            # that is missing, empty, or not a dict.
            details = entry.get("connectionDetails", {})
            if not details or not isinstance(details, dict):
                continue

            connection_details_records.extend(
                {
                    "dataset_id": dataset_id,
                    "datasource_id": source_id,
                    "connection_property": prop_name,
                    "connection_value": str(prop_value),  # Stored as text for consistency
                    "extraction_date": datetime.now(),
                    "extraction_batch_id": BATCH_ID
                }
                for prop_name, prop_value in details.items()
                if prop_value is not None  # Null values are not stored
            )

        if CONFIG['DEBUG_MODE']:
            logger.info(f"Dataset {dataset_id}: {len(datasources_records)} datasources, {len(connection_details_records)} connection details")

            # Dump the first raw datasource to help with debugging.
            if datasources_records:
                logger.info(f"Sample datasource: {json.dumps(datasources[0], indent=2)}")

        return datasources_records, connection_details_records

    except Exception as err:
        logger.error(f"Failed to retrieve datasources for dataset {dataset_id}: {str(err)}")
        # Deliberately best-effort: report nothing for this dataset and keep going.
        return [], []
# ==================================


# CELL 9 - Batch Processing Function
# ==================================
def process_datasets_in_batches(dataset_ids: List[str], access_token: str) -> Tuple[List[Dict], List[Dict]]:
    """
    Retrieve datasources for every dataset, working through the IDs in batches.

    Datasets are handled sequentially in slices of CONFIG['BATCH_SIZE'], with a
    0.1 s pause after each dataset and a 2 s pause between batches. A failure
    on one dataset is logged and does not stop the run.

    Args:
        dataset_ids: List of dataset IDs to process
        access_token: The Azure AD access token

    Returns:
        tuple: (all_datasources_records, all_connection_details_records)
    """
    all_datasources = []
    all_connection_details = []

    total_datasets = len(dataset_ids)
    batch_size = CONFIG['BATCH_SIZE']

    logger.info(f"Processing {total_datasets} datasets in batches of {batch_size}")

    # One start offset per batch; the number of offsets is the number of batches.
    batch_starts = range(0, total_datasets, batch_size)
    total_batches = len(batch_starts)

    for batch_number, start in enumerate(batch_starts, start=1):
        stop = min(start + batch_size, total_datasets)

        logger.info(f"Processing batch {batch_number}/{total_batches} (datasets {start+1}-{stop})")

        # position is the 1-based index of the dataset across the whole run.
        for position, dataset_id in enumerate(dataset_ids[start:stop], start=start + 1):
            try:
                logger.info(f"Processing dataset {position}/{total_datasets}: {dataset_id}")

                found_sources, found_details = get_datasources_for_dataset(dataset_id, access_token)
                all_datasources.extend(found_sources)
                all_connection_details.extend(found_details)

                # Brief pause between consecutive API calls.
                time.sleep(0.1)

            except Exception as err:
                # Log and move on to the next dataset rather than failing the batch.
                logger.error(f"Failed to process dataset {dataset_id}: {str(err)}")
                continue

        # Longer pause between batches (skipped after the final one).
        if batch_number < total_batches:
            logger.info(f"Completed batch {batch_number}. Waiting 2 seconds before next batch...")
            time.sleep(2)

    logger.info(f"Batch processing completed. Total datasources: {len(all_datasources)}, Total connection details: {len(all_connection_details)}")

    return all_datasources, all_connection_details
# ==================================


# CELL 10 - Create DataFrames Function
# ==================================
def create_datasources_dataframes(datasources_data: List[Dict], connection_details_data: List[Dict]) -> Tuple["DataFrame", "DataFrame"]:
    """
    Build the two Spark DataFrames that are loaded into the Delta tables.

    Each record list is converted through a pandas DataFrame into a Spark
    DataFrame with an explicit schema. An empty list produces an empty
    DataFrame with the same schema.

    Args:
        datasources_data: List of main datasource dictionaries
        connection_details_data: List of connection detail dictionaries

    Returns:
        tuple: (datasources_df, connection_details_df)
    """

    def _string(name: str, nullable: bool) -> StructField:
        # Shorthand for a string column.
        return StructField(name, StringType(), nullable)

    def _timestamp(name: str, nullable: bool) -> StructField:
        # Shorthand for a timestamp column.
        return StructField(name, TimestampType(), nullable)

    # Schema of the main datasources table.
    datasources_schema = StructType([
        _string("dataset_id", False),            # Owning dataset (required)
        _string("datasource_type", True),        # e.g. Sql, Oracle, Extension
        _string("datasource_id", True),          # May be null
        _string("gateway_id", True),             # May be null
        _string("connection_string", True),      # Deprecated API field
        _string("name", True),                   # Deprecated API field
        _timestamp("extraction_date", False),    # When the record was extracted
        _string("extraction_batch_id", False)    # Run identifier
    ])

    # Schema of the key-value connection details table.
    connection_details_schema = StructType([
        _string("dataset_id", False),            # Owning dataset (required)
        _string("datasource_id", True),          # May be null
        _string("connection_property", False),   # Property name (server, database, ...)
        _string("connection_value", True),       # Property value as text
        _timestamp("extraction_date", False),    # When the record was extracted
        _string("extraction_batch_id", False)    # Run identifier
    ])

    def _build_frame(records: List[Dict], schema: StructType, label: str):
        # Convert records via pandas, or return an empty frame when there are none.
        if not records:
            logger.warning(f"No {label} data found. Creating empty DataFrame.")
            return spark.createDataFrame([], schema)
        logger.info(f"Creating {label} DataFrame with {len(records)} records")
        return spark.createDataFrame(pd.DataFrame(records), schema=schema)

    datasources_df = _build_frame(datasources_data, datasources_schema, "datasources")
    connection_details_df = _build_frame(connection_details_data, connection_details_schema, "connection details")

    return datasources_df, connection_details_df
# ==================================


# CELL 11 - Delta Lake Operations Functions
# ==================================
def ensure_delta_table_exists(table_name: str, df_schema):
    """
    Ensure the target table exists, creating an empty one if necessary.

    Existence is checked through the catalog rather than by catching errors
    from DESCRIBE TABLE: a transient or permission error must not be mistaken
    for "table missing", because that would lead to the table being recreated
    empty. Creation uses save mode "ignore", so an existing table is never
    overwritten even if another process creates it in the meantime.

    Args:
        table_name: Name of the Delta table
        df_schema: Schema (StructType) to create the table with

    Raises:
        Exception: Errors from the catalog lookup or the table write propagate
            to the caller.
    """
    if spark.catalog.tableExists(table_name):
        logger.info(f"Delta table '{table_name}' already exists")
        return

    logger.info(f"Creating Delta table '{table_name}'")

    # An empty DataFrame carrying only the schema is enough to create the table.
    empty_df = spark.createDataFrame([], df_schema)

    # "ignore" = create if absent, do nothing if present (never overwrite data).
    empty_df.write \
        .mode("ignore") \
        .saveAsTable(table_name)

    logger.info(f"Delta table '{table_name}' created successfully")


def merge_datasources_to_delta(source_df, table_name: str, merge_keys: List[str]):
    """
    Upsert new data into the Delta table with a MERGE keyed on ``merge_keys``.

    Details:
    - Keys are compared with the null-safe operator ``<=>``. With plain ``=``
      a NULL key (datasource_id is nullable) never matches, so those rows
      would be inserted again on every run.
    - The source is de-duplicated on the merge keys first, because MERGE fails
      when several source rows match the same target row. Dropped rows are
      reported with a warning.
    - If the target table is empty the (de-duplicated) rows are appended.
    - The temporary view used by the MERGE is dropped afterwards.

    Args:
        source_df: DataFrame with new data
        table_name: Name of the target Delta table
        merge_keys: List of column names to use for matching records

    Raises:
        ValueError: If ``merge_keys`` is empty.
        Exception: Spark errors from reading, writing or merging propagate.
    """
    logger.info(f"Starting merge operation for {table_name}")

    if not merge_keys:
        raise ValueError("merge_keys must contain at least one column name")

    # One source row per key; dropDuplicates treats NULL keys as equal, like <=>.
    deduped_df = source_df.dropDuplicates(merge_keys)
    dropped_rows = source_df.count() - deduped_df.count()
    if dropped_rows:
        logger.warning(
            f"Dropped {dropped_rows} source rows with duplicate merge keys {merge_keys} before merging into {table_name}"
        )

    # If the table is empty, just insert all records (head(1) avoids a full count).
    if len(spark.table(table_name).head(1)) == 0:
        logger.info(f"Table {table_name} is empty. Inserting all records.")
        deduped_df.write.mode("append").saveAsTable(table_name)
        return

    # Temp view names cannot be qualified, so strip dots/backticks from the table name.
    temp_view_name = f"{table_name}_updates".replace(".", "_").replace("`", "")
    deduped_df.createOrReplaceTempView(temp_view_name)

    try:
        # Null-safe match on every key column.
        merge_condition = " AND ".join(f"target.{key} <=> source.{key}" for key in merge_keys)

        # Update every non-key column on a match.
        update_columns = [name for name in source_df.columns if name not in merge_keys]
        if update_columns:
            update_set_clause = ", ".join(f"target.{name} = source.{name}" for name in update_columns)
            matched_clause = f"""WHEN MATCHED THEN
        UPDATE SET {update_set_clause}"""
        else:
            # Nothing but key columns: an empty UPDATE SET would be invalid SQL.
            matched_clause = ""

        merge_query = f"""
    MERGE INTO {table_name} AS target
    USING {temp_view_name} AS source
    ON {merge_condition}
    {matched_clause}
    WHEN NOT MATCHED THEN
        INSERT *
    """

        if CONFIG['DEBUG_MODE']:
            logger.info(f"Executing merge query: {merge_query}")

        spark.sql(merge_query)
    finally:
        # Do not leave the staging view behind in the session.
        spark.catalog.dropTempView(temp_view_name)

    logger.info(f"Merge operation completed successfully for {table_name}")


def optimize_delta_table(table_name: str):
    """
    Refresh table statistics so the query planner has up-to-date information.

    Runs ANALYZE TABLE ... COMPUTE STATISTICS. Failures are logged as a
    warning and otherwise ignored, since this step is not required for the
    load to be correct.

    Args:
        table_name: Name of the Delta table to optimize
    """
    logger.info(f"Optimizing Delta table '{table_name}'")

    analyze_statement = f"ANALYZE TABLE {table_name} COMPUTE STATISTICS"
    try:
        spark.sql(analyze_statement)
        logger.info(f"Table statistics updated successfully for {table_name}")

        # Reminder for readers of the log: Fabric may already handle optimization.
        logger.info("Note: Microsoft Fabric may automatically optimize Delta tables")
    except Exception as err:
        logger.warning(f"Table optimization step encountered an issue: {str(err)}")
        logger.info("Continuing with process - optimization is not critical for functionality")
# ==================================


# CELL 12 - Main Execution Function
# ==================================
def main():
    """
    Run the full extract-and-load pipeline.

    Steps:
    1. Read the dataset IDs from the Fabric admin tables
    2. Obtain an access token
    3. Retrieve the datasources for every dataset, in batches
    4. Build the two Spark DataFrames
    5. Make sure both Delta tables exist
    6. Merge the data into the tables and refresh their statistics
    7. Print table details and summary statistics

    Returns:
        tuple: (datasources_df, connection_details_df), or (None, None) when
        no dataset IDs were found.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        logger.info("Starting Power BI Datasources to Delta Lake process")

        # Step 1: dataset IDs
        logger.info("Getting dataset IDs from existing Fabric tables...")
        ids = get_dataset_ids()
        if not ids:
            logger.warning("No dataset IDs found. Please check the SQL query and data availability.")
            return None, None

        # Step 2: authentication
        logger.info("Getting access token...")
        token = get_access_token()
        logger.info("Successfully obtained access token")

        # Step 3: call the API for every dataset
        logger.info(f"Processing {len(ids)} datasets to retrieve datasources...")
        source_records, detail_records = process_datasets_in_batches(ids, token)

        # Step 4: DataFrames
        logger.info("Creating PySpark DataFrames...")
        datasources_df, connection_details_df = create_datasources_dataframes(source_records, detail_records)

        # Preview a few rows of whatever was retrieved.
        previews = (
            (source_records, "Sample of datasources data:", datasources_df),
            (detail_records, "Sample of connection details data:", connection_details_df),
        )
        for records, heading, frame in previews:
            if records:
                logger.info(heading)
                frame.show(5, truncate=False)

        # Step 5: make sure both target tables exist
        main_table = CONFIG["DATASOURCES_TABLE_NAME"]
        details_table = CONFIG["CONNECTION_DETAILS_TABLE_NAME"]

        load_plan = (
            (source_records, datasources_df, main_table,
             ["dataset_id", "datasource_id"]),
            (detail_records, connection_details_df, details_table,
             ["dataset_id", "datasource_id", "connection_property"]),
        )
        for _records, frame, table, _keys in load_plan:
            ensure_delta_table_exists(table, frame.schema)

        # Step 6: upsert, but only for tables that actually received data
        for records, frame, table, keys in load_plan:
            if records:
                merge_datasources_to_delta(frame, table, keys)
                optimize_delta_table(table)

        # Step 7: report
        logger.info("Loading completed successfully!")

        # --- main datasources table ---
        logger.info(f"=== {main_table} Table Details ===")
        spark.sql(f"DESCRIBE DETAIL {main_table}").show(truncate=False)

        main_total = spark.table(main_table).count()
        logger.info(f"Total rows in {main_table}: {main_total}")

        if main_total > 0:
            main_stats = spark.sql(f"""
                SELECT 
                    COUNT(*) as total_datasources,
                    COUNT(DISTINCT dataset_id) as unique_datasets,
                    COUNT(DISTINCT datasource_type) as unique_datasource_types,
                    COUNT(DISTINCT gateway_id) as unique_gateways,
                    MAX(extraction_date) as last_updated
                FROM {main_table}
            """)

            logger.info(f"=== {main_table} Summary Statistics ===")
            main_stats.show(truncate=False)

            # How many datasources there are of each type.
            by_type = spark.sql(f"""
                SELECT 
                    datasource_type,
                    COUNT(*) as count
                FROM {main_table}
                GROUP BY datasource_type
                ORDER BY count DESC
            """)

            logger.info("Datasource type distribution:")
            by_type.show(truncate=False)

        # --- connection details table ---
        logger.info(f"=== {details_table} Table Details ===")
        spark.sql(f"DESCRIBE DETAIL {details_table}").show(truncate=False)

        details_total = spark.table(details_table).count()
        logger.info(f"Total rows in {details_table}: {details_total}")

        if details_total > 0:
            details_stats = spark.sql(f"""
                SELECT 
                    COUNT(*) as total_connection_details,
                    COUNT(DISTINCT dataset_id) as unique_datasets,
                    COUNT(DISTINCT datasource_id) as unique_datasources,
                    COUNT(DISTINCT connection_property) as unique_properties,
                    MAX(extraction_date) as last_updated
                FROM {details_table}
            """)

            logger.info(f"=== {details_table} Summary Statistics ===")
            details_stats.show(truncate=False)

            # The ten most frequent connection property names.
            by_property = spark.sql(f"""
                SELECT 
                    connection_property,
                    COUNT(*) as count
                FROM {details_table}
                GROUP BY connection_property
                ORDER BY count DESC
                LIMIT 10
            """)

            logger.info("Top 10 connection properties:")
            by_property.show(truncate=False)

        return datasources_df, connection_details_df

    except Exception as err:
        logger.error(f"Error in main execution: {str(err)}")
        raise
# ==================================


# CELL 13 - Execute Main Function
# ==================================
# Execute the main function and keep the resulting DataFrames available to
# later cells. Both names are None when main() finds no dataset IDs.
if __name__ == "__main__":
    datasources_df, connection_details_df = main()
# ==================================



StatementMeta(, fad4e198-0861-46c0-97cc-62ecf271dbdc, 4, Finished, Available, Finished)

2025-07-17 15:37:09,778 - INFO - Starting execution with batch ID: 4fdb355c-6f7c-4135-94e3-62a3c9d2cf9a
2025-07-17 15:37:09,782 - INFO - Starting Power BI Datasources to Delta Lake process
2025-07-17 15:37:09,782 - INFO - Getting dataset IDs from existing Fabric tables...
2025-07-17 15:37:09,783 - INFO - Executing SQL query to retrieve dataset IDs...
2025-07-17 15:37:33,900 - INFO - Retrieved 147 dataset IDs from the database
2025-07-17 15:37:33,901 - INFO - Sample dataset IDs: ['d9a141b5-20e2-4c8a-825b-02c0f63bd691', 'c6c5b1e8-4f0c-400e-a396-72048f9b3871', '187c3787-c4a5-47a3-a35b-149b72bbd070', '09eac459-9b71-4e9b-8771-45a142078a0d', '36e4c8d8-d521-4577-93f8-307633711275']
2025-07-17 15:37:33,902 - INFO - Getting access token...
2025-07-17 15:37:33,921 - INFO - Successfully obtained access token
2025-07-17 15:37:33,922 - INFO - Processing 147 datasets to retrieve datasources...
2025-07-17 15:37:33,922 - INFO - Processing 147 datasets in batches of 10
2025-07-17 15:37:33,924 - INFO - 

+------------------------------------+---------------+------------------------------------+------------------------------------+-----------------+----+--------------------------+------------------------------------+
|dataset_id                          |datasource_type|datasource_id                       |gateway_id                          |connection_string|name|extraction_date           |extraction_batch_id                 |
+------------------------------------+---------------+------------------------------------+------------------------------------+-----------------+----+--------------------------+------------------------------------+
|d9a141b5-20e2-4c8a-825b-02c0f63bd691|Extension      |6270f3eb-9cd1-4028-b4eb-25713ef9c5e2|7e508ae6-3f91-43a5-ba20-e24caa4b7fd5|NULL             |NULL|2025-07-17 15:37:34.216279|4fdb355c-6f7c-4135-94e3-62a3c9d2cf9a|
|c6c5b1e8-4f0c-400e-a396-72048f9b3871|Extension      |30d92dc2-4a7b-4719-92a7-97c26a879d8a|1c72182e-146a-4a10-8bb0-3c87aa65e68b|NULL    

In [3]:
from pyspark.sql import SparkSession

# Get (or reuse) the Spark session for this cell.
spark = SparkSession.builder.appName("Refresh SQL Endpoint Metadata").getOrCreate()

# Issue REFRESH TABLE for each table loaded by this notebook.
for _table_to_refresh in ("pbi_datasources", "pbi_datasources_connection_details"):
    spark.sql(f"REFRESH TABLE {_table_to_refresh}")
    print("Metadata refresh triggered successfully.")


StatementMeta(, fad4e198-0861-46c0-97cc-62ecf271dbdc, 5, Finished, Available, Finished)

Metadata refresh triggered successfully.
Metadata refresh triggered successfully.
