In [1]:
# DO NOT DELETE THIS CELL

# API Name: Admin - Datasets GetDatasetsAsAdmin
# Command:  GET https://api.powerbi.com/v1.0/myorg/admin/datasets
# Doc:      https://learn.microsoft.com/en-us/rest/api/power-bi/admin/datasets-get-datasets-as-admin

# Loads table: pbi_datasets

StatementMeta(, e62ae15b-0278-4dbe-b75e-b37192526633, 3, Finished, Available, Finished)

In [2]:
# CELL 1 - Title and Introduction
# ==================================
# Power BI Datasets to Delta Lake - PySpark Notebook
# This notebook retrieves Power BI datasets using the GetDatasetsAsAdmin API and loads them into a Delta Lake table
# with optimization for analytics workloads
# ==================================


# CELL 2 - Import Libraries
# ==================================
# Import required libraries
import requests
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit, from_json
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BooleanType
import logging
from typing import Dict, List, Optional
from delta.tables import DeltaTable
import random
import time
# ==================================


# CELL 3 - Configure Logging and Initialize Spark
# ==================================
# Logging setup
# Timestamped, level-tagged messages make it easier to follow the notebook run
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Spark session with the Delta Lake SQL extension and catalog registered.
# In Fabric notebooks, Spark is pre-configured with Delta support, so
# getOrCreate() is expected to hand back the already-running session.
spark = (
    SparkSession.builder
    .appName("PowerBIDatasetsToDeltalake")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Session-level Delta write settings: optimized writes and auto compaction
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")
# ==================================


# CELL 4 - Configuration Parameters
# ==================================
# Configuration Parameters
# These are the settings we'll use throughout the notebook
CONFIG = {
    # --- Power BI REST endpoint ---
    "API_BASE_URL": "https://api.powerbi.com/v1.0/myorg",
    # GetDatasetsAsAdmin endpoint, appended to API_BASE_URL
    "DATASETS_ENDPOINT": "/admin/datasets",

    # --- Retry / backoff behaviour ---
    # Maximum number of attempts per request
    "MAX_RETRIES": 5,
    # First wait between attempts, in seconds
    "INITIAL_BACKOFF_SEC": 1,
    # Upper bound for the wait between attempts, in seconds
    "MAX_BACKOFF_SEC": 60,
    # Multiplier applied to the wait after each failed attempt
    "BACKOFF_FACTOR": 2,
    # Random jitter added to each wait, as a fraction of the current backoff
    "JITTER_FACTOR": 0.1,
    # Per-request timeout, in seconds
    "TIMEOUT": 30,

    # --- Target storage ---
    # Name of the Delta table that receives the datasets
    "DATASETS_TABLE_NAME": "pbi_datasets",
    # Default Tables folder in the Fabric Lakehouse
    "LAKEHOUSE_PATH": "Tables",

    # --- Diagnostics ---
    # True enables the extra debugging output used throughout the notebook
    "DEBUG_MODE": True,
}

# Note: The Power BI Admin API supports standard OData parameters ($top, $skip, $filter)
# We'll rely on @odata.nextLink for pagination as provided by the API
# No artificial record limits are imposed
# ==================================


# CELL 5 - Authentication Function
# ==================================
def get_access_token():
    """
    Fetch the bearer token used to authenticate against the Power BI REST API.

    The token is requested through ``mssparkutils.credentials.getToken`` for
    the ``https://analysis.windows.net/powerbi/api`` resource. ``notebookutils``
    is imported inside the function, so an environment without it fails here
    (with a logged error) rather than at notebook import time.

    Returns:
        The value returned by ``getToken``; the rest of the notebook uses it
        as the access token string in the ``Authorization`` header.

    Raises:
        Exception: Any failure while importing ``notebookutils`` or requesting
            the token is logged and then re-raised unchanged.
    """
    # Resource identifier the Power BI API token must be issued for
    powerbi_resource = "https://analysis.windows.net/powerbi/api"

    try:
        from notebookutils import mssparkutils
        return mssparkutils.credentials.getToken(powerbi_resource)
    except Exception as token_error:
        logger.error(f"Failed to get access token: {str(token_error)}")
        raise
# ==================================


# CELL 6 - API Call Function
# ==================================
def call_powerbi_api(endpoint: str, access_token: str, params: Optional[Dict] = None) -> Dict:
    """
    Make a GET request to the Power BI REST API with retry and rate-limit handling.

    Behaviour:
    - Builds the URL from ``CONFIG['API_BASE_URL']`` plus ``endpoint`` and sends
      the bearer token in the ``Authorization`` header.
    - On HTTP 429, waits for the number of seconds given by a numeric
      ``Retry-After`` header, otherwise for an exponential backoff with jitter,
      and then retries.
    - On any other ``requests`` failure (connection error, timeout, non-2xx
      status, undecodable JSON surfaced as a ``RequestException``), retries
      with exponential backoff until ``CONFIG['MAX_RETRIES']`` attempts are used.
    - Never returns ``None``: if every attempt is rate-limited, an
      ``HTTPError`` is raised instead of silently falling off the end of the loop.
    - The ``Authorization`` header is redacted before request headers are logged,
      so the bearer token does not end up in notebook output.

    Args:
        endpoint: The API endpoint path (e.g., "/admin/datasets")
        access_token: The Azure AD access token
        params: Optional query parameters for the API call

    Returns:
        dict: The parsed JSON response from the API

    Raises:
        requests.exceptions.RequestException: If the API call fails after all
            retries, including the case where all attempts returned 429.
        json.JSONDecodeError: If the body is not valid JSON and the installed
            ``requests`` version does not wrap that error in a ``RequestException``.
    """
    url = f"{CONFIG['API_BASE_URL']}{endpoint}"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }

    max_retries = CONFIG['MAX_RETRIES']
    backoff_time = CONFIG['INITIAL_BACKOFF_SEC']
    # Remembered so the final error can carry the last 429 response
    response = None

    for attempt in range(max_retries):
        last_attempt = attempt == max_retries - 1
        try:
            # Log the full URL with parameters for debugging
            logger.info(f"Making API call to: {url} with params: {params} (Attempt {attempt + 1})")

            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=CONFIG['TIMEOUT']
            )

            logger.info(f"Response status: {response.status_code}")

            # Rate limit handling (429 Too Many Requests)
            if response.status_code == 429:
                if last_attempt:
                    # No attempts left: skip the pointless sleep and report the failure below
                    break

                # Prefer the server-provided wait time when it is a plain number of seconds
                retry_after = response.headers.get('Retry-After')
                if retry_after and retry_after.isdigit():
                    wait_time = int(retry_after)
                else:
                    # Exponential backoff with jitter to avoid synchronized retries
                    jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
                    wait_time = backoff_time + jitter
                    backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])

                logger.warning(f"Rate limit exceeded (429). Waiting {wait_time:.2f} seconds before retry.")
                time.sleep(wait_time)
                continue

            # Log details for any other error status before raising
            if response.status_code >= 400:
                logger.error(f"API error: Status {response.status_code}, Response: {response.text}")
                logger.error(f"Request URL: {response.request.url}")
                # The Authorization header holds the bearer token; never write it to the log
                redacted_headers = {
                    name: ("<redacted>" if name.lower() == "authorization" else value)
                    for name, value in response.request.headers.items()
                }
                logger.error(f"Request headers: {redacted_headers}")

            # Raises HTTPError for remaining 4xx/5xx statuses
            response.raise_for_status()

            # Success: parse and summarize the payload
            try:
                response_json = response.json()
                if "value" in response_json and isinstance(response_json["value"], list):
                    logger.info(f"Response contains {len(response_json['value'])} items in 'value' array")
                if "@odata.nextLink" in response_json:
                    logger.info(f"Response contains @odata.nextLink for pagination")
                return response_json
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse response as JSON: {str(e)}")
                logger.error(f"Response content: {response.text[:1000]}")  # Log first 1000 chars of response
                raise

        except requests.exceptions.RequestException as e:
            # 429 responses are handled above without raising, so everything
            # arriving here is a non-rate-limit failure.
            logger.warning(f"API call failed (Attempt {attempt + 1}): {str(e)}")

            if last_attempt:
                logger.error(f"All retry attempts failed for endpoint: {endpoint}")
                logger.error(f"Final error: {str(e)}")
                raise

            # Exponential backoff with jitter before the next attempt
            jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
            wait_time = backoff_time + jitter
            backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])

            logger.info(f"Waiting {wait_time:.2f} seconds before retry.")
            time.sleep(wait_time)

    # Reached only when the loop ran out of attempts on 429 responses
    # (or MAX_RETRIES is 0); fail loudly instead of returning None.
    logger.error(f"All retry attempts failed for endpoint: {endpoint}")
    raise requests.exceptions.HTTPError(
        f"Rate limit (429) persisted after {max_retries} attempts for endpoint: {endpoint}",
        response=response
    )
# ==================================


# CELL 7 - Get Datasets Function
# ==================================
def get_datasets(access_token: str) -> List[Dict]:
    """
    Retrieve all datasets from the Power BI Admin API, handling pagination.

    The first page is requested through ``call_powerbi_api`` using
    ``CONFIG['DATASETS_ENDPOINT']``. While a response carries an
    ``@odata.nextLink``, that absolute URL is fetched with ``_fetch_next_page``
    and the items of each page's ``value`` array are accumulated.

    A page request that cannot be completed always raises; a page is never
    processed twice, so exhausted retries cannot cause duplicates or an
    endless loop.

    NOTE(review): which datasets the endpoint returns depends on the caller's
    admin rights - confirm against the GetDatasetsAsAdmin documentation.

    Args:
        access_token: The Azure AD access token

    Returns:
        list: A list of all dataset objects

    Raises:
        requests.exceptions.RequestException: If any page cannot be retrieved
            after all retries, or if the first-page call yields no payload.
    """
    all_datasets = []
    next_link = None
    page_count = 0

    while True:
        page_count += 1

        if next_link:
            # Subsequent pages: the @odata.nextLink is a full URL, used as-is
            if CONFIG['DEBUG_MODE']:
                logger.info(f"Page {page_count}: Making request with next link: {next_link}")

            response_data = _fetch_next_page(next_link, access_token, page_count)
        else:
            # First page - use the standard call_powerbi_api function
            # No artificial $top limit - let the API return its natural page size
            if CONFIG['DEBUG_MODE']:
                logger.info(f"Page {page_count}: Making initial request without $top limit")

            try:
                response_data = call_powerbi_api(CONFIG['DATASETS_ENDPOINT'], access_token)
            except requests.exceptions.RequestException as e:
                logger.error(f"API call failed on page {page_count}: {str(e)}")
                raise

        # Guard against a missing payload so the failure is reported as an API
        # error rather than an AttributeError further down.
        if response_data is None:
            logger.error(f"API call returned no data on page {page_count}")
            raise requests.exceptions.RequestException(
                f"No response data received for page {page_count}"
            )

        # Log the response structure for debugging
        if CONFIG['DEBUG_MODE']:
            logger.info(f"Response keys: {list(response_data.keys())}")

        # Power BI API returns datasets in the "value" array
        datasets = response_data.get("value", [])

        if datasets:
            all_datasets.extend(datasets)
            logger.info(f"Retrieved {len(datasets)} datasets on page {page_count}. Running total: {len(all_datasets)}")

            # Log first dataset for debugging
            if CONFIG['DEBUG_MODE']:
                logger.info(f"Sample dataset keys: {list(datasets[0].keys())}")
        else:
            logger.warning(f"No datasets found on page {page_count}")

        # Check if there are more pages using @odata.nextLink
        next_link = response_data.get("@odata.nextLink")

        if next_link:
            logger.info(f"Found @odata.nextLink for pagination")
        else:
            logger.info("No @odata.nextLink found - this is the last page")
            break

    logger.info(f"Finished retrieving all datasets. Total count: {len(all_datasets)}")
    return all_datasets


def _fetch_next_page(url: str, access_token: str, page_count: int) -> Dict:
    """
    Fetch one follow-up page from an absolute @odata.nextLink URL with retries.

    Mirrors the retry policy used for the first page: 429 responses wait for a
    numeric ``Retry-After`` value or an exponential backoff with jitter; other
    ``requests`` failures back off exponentially. Raises once
    ``CONFIG['MAX_RETRIES']`` attempts are used up, whatever the cause.

    Args:
        url: Absolute URL taken from the previous page's ``@odata.nextLink``
        access_token: The Azure AD access token
        page_count: 1-based page number, used only in log messages

    Returns:
        dict: The parsed JSON body of the page

    Raises:
        requests.exceptions.RequestException: If the page cannot be retrieved
            within the configured number of attempts.
    """
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }

    max_retries = CONFIG['MAX_RETRIES']
    backoff_time = CONFIG['INITIAL_BACKOFF_SEC']
    # Remembered so the final error can carry the last 429 response
    response = None

    for attempt in range(max_retries):
        last_attempt = attempt == max_retries - 1
        try:
            logger.info(f"Making API call to: {url} (Attempt {attempt + 1})")

            response = requests.get(
                url,
                headers=headers,
                timeout=CONFIG['TIMEOUT']
            )

            logger.info(f"Response status: {response.status_code}")

            # Handle rate limiting
            if response.status_code == 429:
                if last_attempt:
                    # Out of attempts: fall through to the explicit failure below
                    break

                retry_after = response.headers.get('Retry-After')
                if retry_after and retry_after.isdigit():
                    wait_time = int(retry_after)
                else:
                    jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
                    wait_time = backoff_time + jitter
                    backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])

                logger.warning(f"Rate limit exceeded (429). Waiting {wait_time:.2f} seconds before retry.")
                time.sleep(wait_time)
                continue

            # Log errors
            if response.status_code >= 400:
                logger.error(f"API error: Status {response.status_code}, Response: {response.text}")
                logger.error(f"Request URL: {response.request.url}")

            response.raise_for_status()
            return response.json()

        except requests.exceptions.RequestException as e:
            if last_attempt:
                logger.error(f"All retry attempts failed for page {page_count}")
                raise

            jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
            wait_time = backoff_time + jitter
            backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])

            logger.warning(f"API call failed (Attempt {attempt + 1}): {str(e)}")
            logger.info(f"Waiting {wait_time:.2f} seconds before retry.")
            time.sleep(wait_time)

    # Reached only when every attempt was rate-limited (or MAX_RETRIES is 0).
    # Raising here prevents the caller from re-processing the previous page.
    logger.error(f"All retry attempts failed for page {page_count}")
    raise requests.exceptions.HTTPError(
        f"Rate limit (429) persisted after {max_retries} attempts for page {page_count}",
        response=response
    )
# ==================================


# CELL 8 - Create Enhanced DataFrame Function
# ==================================
def create_enhanced_datasets_dataframe(datasets_data: List[Dict]) -> "DataFrame":
    """
    Build a Spark DataFrame with a fixed, explicit schema from raw dataset dicts.

    Column layout (in order):
    - Six nullable boolean columns: isEffectiveIdentityRequired,
      isEffectiveIdentityRolesRequired, isInPlaceSharingEnabled,
      isOnPremGatewayRequired, isRefreshable, addRowsAPIEnabled
    - Ten nullable string columns: configuredBy, createReportEmbedURL,
      createdDate, description, id, name, qnaEmbedURL, targetStorageMode,
      webUrl, workspaceId
    - extraction_timestamp (non-nullable timestamp), identical for every row
      of one call

    Value handling:
    - Booleans: ``None`` and real booleans pass through; strings are true when
      they equal 'true', '1' or 'yes' (case-insensitive); anything else goes
      through ``bool()``.
    - Strings: ``None`` stays ``None`` and other values are passed through
      ``str()``. A missing ``id`` is replaced by ``missing_id_<row index>`` and
      a warning is logged.

    Every column gets a declared type, so no column can end up untyped (VOID)
    when written to Delta.

    Args:
        datasets_data: List of dataset dictionaries from the API

    Returns:
        DataFrame: A PySpark DataFrame with the schema above; empty (but with
        the same schema) when ``datasets_data`` is empty.
    """
    from datetime import datetime

    bool_columns = (
        "isEffectiveIdentityRequired", "isEffectiveIdentityRolesRequired",
        "isInPlaceSharingEnabled", "isOnPremGatewayRequired", "isRefreshable", "addRowsAPIEnabled",
    )
    text_columns = (
        "configuredBy", "createReportEmbedURL", "createdDate", "description",
        "id", "name", "qnaEmbedURL", "targetStorageMode", "webUrl", "workspaceId",
    )

    # Explicit types for every column; all API fields are nullable (including id),
    # only the extraction timestamp is required.
    target_schema = StructType(
        [StructField(column, BooleanType(), True) for column in bool_columns]
        + [StructField(column, StringType(), True) for column in text_columns]
        + [StructField("extraction_timestamp", TimestampType(), False)]
    )

    def as_bool(raw):
        # None and genuine booleans are kept untouched
        if raw is None or isinstance(raw, bool):
            return raw
        if isinstance(raw, str):
            return raw.lower() in ['true', '1', 'yes']
        return bool(raw)

    # Peek at the first payload entry to show which fields the API actually sent
    if datasets_data and CONFIG['DEBUG_MODE']:
        logger.info(f"Available fields in dataset: {list(datasets_data[0].keys())}")

    # Nothing to convert: hand back an empty frame that still carries the schema
    if not datasets_data:
        logger.warning("No datasets found. Creating empty DataFrame with predefined schema.")
        return spark.createDataFrame(spark.sparkContext.emptyRDD(), target_schema)

    # One timestamp shared by all rows of this extraction
    extracted_at = datetime.now()

    rows = []
    for dataset in datasets_data:
        row = [as_bool(dataset.get(column)) for column in bool_columns]

        for column in text_columns:
            raw = dataset.get(column)
            if raw is not None:
                row.append(str(raw))
            elif column == "id":
                # Placeholder keyed on the row's position so it is unique within this batch
                logger.warning(f"Missing ID for dataset: {dataset.get('name', 'Unknown')}")
                row.append(f"missing_id_{len(rows)}")
            else:
                row.append(None)

        row.append(extracted_at)
        rows.append(row)

    if CONFIG['DEBUG_MODE']:
        logger.info(f"Creating DataFrame with {len(rows)} records")
        if rows:
            logger.info(f"Sample record: {rows[0]}")

    # Build the frame against the explicit schema; fall back to an empty frame
    # with the same schema if no rows were produced.
    if rows:
        enhanced_df = spark.createDataFrame(rows, target_schema)
    else:
        enhanced_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), target_schema)

    if CONFIG['DEBUG_MODE']:
        logger.info("Final DataFrame schema:")
        enhanced_df.printSchema()

        logger.info("Sample data:")
        enhanced_df.show(2, truncate=False)

    return enhanced_df
# ==================================


# CELL 9 - Delta Lake Operations Functions
# ==================================
def ensure_delta_table_exists(table_name: str, df_schema):
    """
    Ensure the Delta table exists, creating it with the proper schema if necessary.

    Existence is checked explicitly through ``spark.catalog.tableExists``
    (requires PySpark 3.3+). Only when the catalog reports that the table is
    absent is an empty DataFrame with ``df_schema`` written via ``saveAsTable``.
    Any other failure (for example an error while inspecting an existing table)
    propagates to the caller instead of being mistaken for "table missing",
    which would otherwise overwrite an existing table with an empty one.

    Creating the table from an explicitly typed empty DataFrame ensures every
    column has a concrete type before any data operations.

    Args:
        table_name: Name of the Delta table
        df_schema: Schema of the DataFrame
    """
    if spark.catalog.tableExists(table_name):
        logger.info(f"Delta table '{table_name}' already exists")

        # Log the existing schema for comparison
        if CONFIG['DEBUG_MODE']:
            logger.info("Existing table schema:")
            spark.sql(f"DESCRIBE TABLE {table_name}").show(truncate=False)
        return

    # Table doesn't exist, create it with explicit schema
    logger.info(f"Creating Delta table '{table_name}' with explicit schema")

    # Empty DataFrame that carries only the target schema
    empty_df = spark.createDataFrame([], df_schema)

    # Create the table; reached only after the catalog reported it as absent
    empty_df.write \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .option("mergeSchema", "false") \
        .saveAsTable(table_name)

    logger.info(f"Delta table '{table_name}' created successfully with explicit schema")

    # Verify the table was created correctly
    if CONFIG['DEBUG_MODE']:
        logger.info("Created table schema:")
        spark.sql(f"DESCRIBE TABLE {table_name}").show(truncate=False)


def merge_data_to_delta(source_df, table_name: str):
    """
    Upsert ``source_df`` into the Delta table ``table_name``, keyed on ``id``.

    Steps:
    1. Register ``source_df`` as the temp view ``dataset_updates``.
    2. If the target table has no rows, append ``source_df`` and return.
    3. Otherwise run a SQL ``MERGE``: rows whose ``id`` matches are updated
       column by column, unmatched rows are inserted.
    4. If the ``MERGE`` raises, log the error and overwrite the whole table
       (including its schema) with ``source_df``. Rows that exist only in the
       target are lost in that case. An error in this fallback is logged and
       re-raised.

    Args:
        source_df: DataFrame with new data
        table_name: Name of the target Delta table
    """
    logger.info(f"Starting merge operation for {table_name}")

    existing_rows = spark.table(table_name)

    # Print both schemas side by side to make mismatches easy to spot
    if CONFIG['DEBUG_MODE']:
        for heading, frame in (
            ("Source DataFrame schema:", source_df),
            ("Target table schema:", existing_rows),
        ):
            logger.info(heading)
            frame.printSchema()

    # The MERGE statement below reads the new data through this view
    source_df.createOrReplaceTempView("dataset_updates")

    # An empty target needs no matching: a plain append is enough
    if existing_rows.count() == 0:
        logger.info(f"Table {table_name} is empty. Inserting all records directly.")
        source_df.write.mode("append").saveAsTable(table_name)
        logger.info("Direct insert completed successfully")
        return

    try:
        # Upsert with every column listed explicitly
        upsert_sql = f"""
        MERGE INTO {table_name} AS target
        USING dataset_updates AS source
        ON target.id = source.id
        WHEN MATCHED THEN
            UPDATE SET 
                target.isEffectiveIdentityRequired = source.isEffectiveIdentityRequired,
                target.isEffectiveIdentityRolesRequired = source.isEffectiveIdentityRolesRequired,
                target.isInPlaceSharingEnabled = source.isInPlaceSharingEnabled,
                target.isOnPremGatewayRequired = source.isOnPremGatewayRequired,
                target.isRefreshable = source.isRefreshable,
                target.addRowsAPIEnabled = source.addRowsAPIEnabled,
                target.configuredBy = source.configuredBy,
                target.createReportEmbedURL = source.createReportEmbedURL,
                target.createdDate = source.createdDate,
                target.description = source.description,
                target.name = source.name,
                target.qnaEmbedURL = source.qnaEmbedURL,
                target.targetStorageMode = source.targetStorageMode,
                target.webUrl = source.webUrl,
                target.workspaceId = source.workspaceId,
                target.extraction_timestamp = source.extraction_timestamp
        WHEN NOT MATCHED THEN
            INSERT (
                isEffectiveIdentityRequired, isEffectiveIdentityRolesRequired, 
                isInPlaceSharingEnabled, isOnPremGatewayRequired, isRefreshable,
                addRowsAPIEnabled, configuredBy, createReportEmbedURL, 
                createdDate, description, id, name, qnaEmbedURL,
                targetStorageMode, webUrl, workspaceId, extraction_timestamp
            )
            VALUES (
                source.isEffectiveIdentityRequired, source.isEffectiveIdentityRolesRequired,
                source.isInPlaceSharingEnabled, source.isOnPremGatewayRequired, source.isRefreshable,
                source.addRowsAPIEnabled, source.configuredBy, source.createReportEmbedURL,
                source.createdDate, source.description, source.id, source.name, source.qnaEmbedURL,
                source.targetStorageMode, source.webUrl, source.workspaceId, source.extraction_timestamp
            )
        """

        spark.sql(upsert_sql)
        logger.info("Merge operation completed successfully")

    except Exception as merge_error:
        logger.error(f"Merge operation failed: {str(merge_error)}")
        logger.info("Attempting fallback approach with INSERT OVERWRITE")

        # Last resort: replace the table contents and schema with the new extract
        try:
            overwrite_writer = source_df.write.mode("overwrite").option("overwriteSchema", "true")
            overwrite_writer.saveAsTable(table_name)
            logger.info("Fallback INSERT OVERWRITE completed successfully")
        except Exception as fallback_error:
            logger.error(f"Fallback approach also failed: {str(fallback_error)}")
            raise


def optimize_delta_table(table_name: str):
    """
    Refresh statistics for the Delta table; failures are logged, never raised.

    The only maintenance command issued is ``ANALYZE TABLE ... COMPUTE
    STATISTICS``. Afterwards the table is looked up through
    ``DeltaTable.forName`` and ``DESCRIBE DETAIL``; their results are not used,
    so they act purely as a check that both calls succeed. No ``OPTIMIZE``
    command is run here.

    Any exception in these steps is downgraded to a warning, so a failed
    optimization never stops the surrounding load.

    Args:
        table_name: Name of the Delta table to optimize
    """
    logger.info(f"Optimizing Delta table '{table_name}'")

    try:
        # Fresh statistics help the query planner
        spark.sql(f"ANALYZE TABLE {table_name} COMPUTE STATISTICS")
        logger.info("Table statistics updated successfully")

        # Results are intentionally discarded; an error from either call is
        # handled by the except branch below.
        DeltaTable.forName(spark, table_name)
        spark.sql(f"DESCRIBE DETAIL {table_name}")

        logger.info("Delta table optimization completed via statistics computation")
        logger.info("Note: Microsoft Fabric may automatically optimize Delta tables")
    except Exception as optimize_error:
        logger.warning(f"Table optimization step encountered an issue: {str(optimize_error)}")
        logger.info("Continuing with process - optimization is not critical for functionality")
# ==================================


# CELL 10 - Main Execution Function
# ==================================
def _empty_datasets_schema() -> StructType:
    """Build the schema used for the placeholder DataFrame when the API returns no datasets."""
    boolean_columns = [
        "isEffectiveIdentityRequired",
        "isEffectiveIdentityRolesRequired",
        "isInPlaceSharingEnabled",
        "isOnPremGatewayRequired",
        "isRefreshable",
        "addRowsAPIEnabled",
    ]
    string_columns = [
        "configuredBy",
        "createReportEmbedURL",
        "createdDate",
        "description",
        "id",
        "name",
        "qnaEmbedURL",
        "targetStorageMode",
        "webUrl",
        "workspaceId",
    ]
    # Column order is significant: booleans, then strings, then the timestamp
    fields = [StructField(column, BooleanType(), True) for column in boolean_columns]
    fields += [StructField(column, StringType(), True) for column in string_columns]
    fields.append(StructField("extraction_timestamp", TimestampType(), False))
    return StructType(fields)


def _show_query_best_effort(query: str, heading: str, failure_label: str) -> bool:
    """Run a SQL query, log a heading and show the result; on any error log a warning and return False."""
    try:
        # spark.sql() resolves table/column names when it is called, so it must
        # sit inside the try for a missing column to be handled here.
        result_df = spark.sql(query)
        logger.info(heading)
        result_df.show(truncate=False)
        return True
    except Exception as e:
        logger.warning(f"Could not display {failure_label}: {str(e)}")
        return False


def main():
    """
    Orchestrate the end-to-end load of Power BI datasets into the Delta table.

    Steps:
    1. Get the authentication token
    2. Retrieve all datasets from the Power BI Admin API
    3. Build the enhanced PySpark DataFrame (or an empty one with the expected
       schema when the API returned nothing)
    4. Ensure the target Delta table exists
    5. Merge the data into the Delta table (skipped when there is no data)
    6. Refresh table statistics (skipped when there is no data)
    7. Display table details, row count and summary/distribution reports

    Returns:
        The DataFrame that was built from the API response (possibly empty).

    Raises:
        Exception: Any error from the mandatory steps is logged and re-raised.
            The reporting queries in step 7 are best-effort, except for the
            table detail, row count, fallback count and final schema display.
    """
    try:
        logger.info("Starting Power BI Datasets to Delta Lake process")

        # Step 1: Get authentication token
        logger.info("Getting access token...")
        access_token = get_access_token()
        logger.info("Successfully obtained access token")

        # Step 2: Retrieve all datasets
        logger.info("Retrieving datasets from Power BI Admin API...")
        datasets_data = get_datasets(access_token)

        if not datasets_data:
            logger.warning("No datasets found. Please check your permissions and API access.")
            # Create empty dataframe with schema for consistent table structure
            datasets_df = spark.createDataFrame([], _empty_datasets_schema())
        else:
            # Step 3: Create enhanced DataFrame
            logger.info(f"Creating DataFrame for {len(datasets_data)} datasets...")
            datasets_df = create_enhanced_datasets_dataframe(datasets_data)

        # Show sample data
        logger.info("Sample of enhanced datasets data:")
        datasets_df.show(5, truncate=False)

        # Step 4: Prepare Delta table
        table_name = CONFIG["DATASETS_TABLE_NAME"]
        ensure_delta_table_exists(table_name, datasets_df.schema)

        # Step 5: Merge data into Delta table (if we have data)
        if datasets_data:
            merge_data_to_delta(datasets_df, table_name)

            # Step 6: Optimize the Delta table
            optimize_delta_table(table_name)

        # Step 7: Display final statistics
        logger.info("Loading completed successfully!")

        # Show table information
        spark.sql(f"DESCRIBE DETAIL {table_name}").show(truncate=False)

        # Show row count
        row_count = spark.table(table_name).count()
        logger.info(f"Total rows in {table_name}: {row_count}")

        # Summary statistics query text (executed inside the try below)
        summary_query = f"""
            SELECT 
                COUNT(*) as total_datasets,
                COUNT(DISTINCT workspaceId) as unique_workspaces,
                COUNT(DISTINCT configuredBy) as unique_owners,
                COUNT(DISTINCT targetStorageMode) as storage_modes,
                SUM(CASE WHEN isRefreshable = true THEN 1 ELSE 0 END) as refreshable_datasets,
                SUM(CASE WHEN isOnPremGatewayRequired = true THEN 1 ELSE 0 END) as onprem_gateway_required,
                MAX(extraction_timestamp) as last_updated
            FROM {table_name}
            WHERE 1=1  -- Always true condition to make query more robust
        """

        logger.info("Summary statistics:")
        try:
            # Previously spark.sql() ran outside this try, so a missing column
            # raised before the fallback below could ever be reached.
            spark.sql(summary_query).show(truncate=False)
        except Exception as e:
            logger.warning(f"Could not display summary statistics: {str(e)}")
            # Try a simpler query
            simple_stats = spark.sql(f"SELECT COUNT(*) as total_datasets FROM {table_name}")
            simple_stats.show()

        # Optional: Show distribution by storage mode (only if column exists)
        _show_query_best_effort(
            f"""
                SELECT
                    targetStorageMode,
                    COUNT(*) as count
                FROM {table_name}
                WHERE targetStorageMode IS NOT NULL
                GROUP BY targetStorageMode
                ORDER BY count DESC
            """,
            heading="Dataset distribution by storage mode:",
            failure_label="storage mode distribution",
        )

        # Optional: Show refreshable vs non-refreshable datasets (only if column exists)
        _show_query_best_effort(
            f"""
                SELECT
                    isRefreshable,
                    COUNT(*) as count
                FROM {table_name}
                GROUP BY isRefreshable
                ORDER BY count DESC
            """,
            heading="Dataset distribution by refresh capability:",
            failure_label="refresh distribution",
        )

        # Show the actual columns in the table
        logger.info("Actual table schema:")
        spark.sql(f"DESCRIBE {table_name}").show(truncate=False)

        return datasets_df

    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")
        raise
# ==================================


# CELL 11 - Execute Main Function
# ==================================
# Run the pipeline only when this module is the entry point (__name__ == "__main__").
# The DataFrame returned by main() is bound to the module-level name `datasets_df`.
if __name__ == "__main__":
    datasets_df = main()
# ==================================

StatementMeta(, e62ae15b-0278-4dbe-b75e-b37192526633, 4, Finished, Available, Finished)

2025-07-17 15:21:02,383 - INFO - Starting Power BI Datasets to Delta Lake process
2025-07-17 15:21:02,383 - INFO - Getting access token...
2025-07-17 15:21:02,398 - INFO - Successfully obtained access token
2025-07-17 15:21:02,399 - INFO - Retrieving datasets from Power BI Admin API...
2025-07-17 15:21:02,400 - INFO - Page 1: Making initial request without $top limit
2025-07-17 15:21:02,400 - INFO - Making API call to: https://api.powerbi.com/v1.0/myorg/admin/datasets with params: None (Attempt 1)
2025-07-17 15:21:32,820 - INFO - Merge operation completed successfully
2025-07-17 15:21:32,821 - INFO - Optimizing Delta table 'pbi_datasets'
2025-07-17 15:21:36,103 - INFO - Table statistics updated successfully
2025-07-17 15:21:37,450 - INFO - Total rows in pbi_datasets: 11460
2025-07-17 15:21:37,661 - INFO - Summary statistics:
2025-07-17 15:21:40,777 - INFO - Actual table schema:


root
 |-- isEffectiveIdentityRequired: boolean (nullable = true)
 |-- isEffectiveIdentityRolesRequired: boolean (nullable = true)
 |-- isInPlaceSharingEnabled: boolean (nullable = true)
 |-- isOnPremGatewayRequired: boolean (nullable = true)
 |-- isRefreshable: boolean (nullable = true)
 |-- addRowsAPIEnabled: boolean (nullable = true)
 |-- configuredBy: string (nullable = true)
 |-- createReportEmbedURL: string (nullable = true)
 |-- createdDate: string (nullable = true)
 |-- description: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- qnaEmbedURL: string (nullable = true)
 |-- targetStorageMode: string (nullable = true)
 |-- webUrl: string (nullable = true)
 |-- workspaceId: string (nullable = true)
 |-- extraction_timestamp: timestamp (nullable = false)

+---------------------------+--------------------------------+-----------------------+-----------------------+-------------+-----------------+------------------------+-------------

In [3]:
from pyspark.sql import SparkSession

# Obtain a Spark session through the builder, bound to the module-level name `spark`
spark = (
    SparkSession.builder
    .appName("Refresh SQL Endpoint Metadata")
    .getOrCreate()
)

# Issue REFRESH TABLE for the datasets table, then confirm on stdout
spark.sql("REFRESH TABLE pbi_datasets")
print("Metadata refresh triggered successfully.")


StatementMeta(, e62ae15b-0278-4dbe-b75e-b37192526633, 5, Finished, Available, Finished)

Metadata refresh triggered successfully.
