In [1]:
# DO NOT DELETE THIS CELL

# API Name: Admin - Dataflows GetDataflowsAsAdmin
# Command:  GET https://api.powerbi.com/v1.0/myorg/admin/dataflows
# Doc:      https://learn.microsoft.com/en-us/rest/api/power-bi/admin/dataflows-get-dataflows-as-admin

# Loads table: pbi_dataflows

StatementMeta(, 08f7bbd2-3093-4584-a86a-d1d10b5b1bdd, 3, Finished, Available, Finished)

In [2]:
# CELL 1 - Title and Introduction
# ==================================
# Power BI Dataflows to Delta Lake - PySpark Notebook
# This notebook retrieves Power BI dataflows using GetDataflowsAsAdmin API and loads them 
# into a Delta Lake table with optimization for analytics workloads
# 
# Table Created:
# - pbi_dataflows: Core dataflow metadata
# ==================================


# CELL 2 - Import Libraries
# ==================================
# Import required libraries
import requests
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit, explode, when, coalesce
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, ArrayType
import logging
from typing import Dict, List, Optional
from delta.tables import DeltaTable
import random
import time
import uuid
from datetime import datetime
# ==================================


# CELL 3 - Configure Logging and Initialize Spark
# ==================================
# Logging setup: INFO level, each message prefixed with a timestamp and its level
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Obtain the Spark session (getOrCreate returns an already-running session if there is one),
# requesting the Delta Lake SQL extension and catalog implementation.
spark = (
    SparkSession.builder
    .appName("PowerBIDataflowsToDelta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Turn on optimized writes and auto-compaction for Delta operations in this session
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")
# ==================================


# CELL 4 - Configuration Parameters
# ==================================
# Notebook-wide settings, read by the functions defined below.
CONFIG = {
    # --- Power BI REST API ---
    # Base URL and the GetDataflowsAsAdmin endpoint path appended to it
    "API_BASE_URL": "https://api.powerbi.com/v1.0/myorg",
    "DATAFLOWS_ENDPOINT": "/admin/dataflows",

    # --- Retry / backoff behaviour ---
    # Maximum number of attempts per API call
    "MAX_RETRIES": 5,
    # First wait (seconds), upper bound for any wait (seconds), and growth multiplier
    "INITIAL_BACKOFF_SEC": 1,
    "MAX_BACKOFF_SEC": 60,
    "BACKOFF_FACTOR": 2,
    # Random jitter added to each wait, as a fraction of the current backoff
    "JITTER_FACTOR": 0.1,
    # Per-request timeout in seconds
    "TIMEOUT": 30,

    # --- Storage ---
    # Target table for dataflow metadata
    "DATAFLOWS_TABLE_NAME": "pbi_dataflows",
    # Default Tables folder in the Fabric Lakehouse
    "LAKEHOUSE_PATH": "Tables",

    # --- Diagnostics / limits ---
    # True enables extra debugging output
    "DEBUG_MODE": True,
    # Documented API rate limit (requests per hour)
    "MAX_REQUESTS_PER_HOUR": 200,
}

# Announce the start of the run (the per-run batch ID once generated here has been removed)
logger.info("Starting Power BI Dataflows extraction")
# ==================================


# CELL 5 - Authentication Function
# ==================================
def get_access_token():
    """
    Fetch the access token used to authenticate against the Power BI REST API.

    The token is requested from mssparkutils (imported lazily from notebookutils,
    so the import only has to succeed when the function is actually called) for
    the Power BI service resource.

    Returns:
        The value returned by mssparkutils.credentials.getToken for the
        Power BI resource.

    Raises:
        Exception: Any error raised while importing notebookutils or requesting
            the token is logged and then re-raised unchanged.
    """
    # Resource identifier of the Power BI service
    powerbi_resource = "https://analysis.windows.net/powerbi/api"

    try:
        from notebookutils import mssparkutils
        return mssparkutils.credentials.getToken(powerbi_resource)
    except Exception as auth_error:
        logger.error(f"Failed to get access token: {str(auth_error)}")
        raise
# ==================================


# CELL 6 - API Call Function
# ==================================
def call_powerbi_api(endpoint: str, access_token: str, params: Optional[Dict] = None) -> Dict:
    """
    Make a GET request to the Power BI REST API with retry and rate-limit handling.

    The request is attempted up to CONFIG['MAX_RETRIES'] times:
    - An HTTP 429 response waits for the server's Retry-After value when it is a
      whole number of seconds, otherwise for an exponential backoff plus jitter,
      and then retries.
    - Any other requests exception (including HTTP errors raised by
      raise_for_status) is retried after an exponential backoff plus jitter and
      re-raised on the final attempt.

    Args:
        endpoint: The API endpoint path (e.g., "/admin/dataflows")
        access_token: The Azure AD access token
        params: Optional query parameters for the API call

    Returns:
        The parsed JSON body of the successful response

    Raises:
        requests.exceptions.RequestException: If the call still fails after all
            attempts, including when every attempt was rate limited (429).
    """
    url = f"{CONFIG['API_BASE_URL']}{endpoint}"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }
    max_retries = CONFIG['MAX_RETRIES']

    # Current backoff; grows by BACKOFF_FACTOR after each wait, capped at MAX_BACKOFF_SEC
    backoff_time = CONFIG['INITIAL_BACKOFF_SEC']

    for attempt in range(max_retries):
        last_attempt = attempt == max_retries - 1
        try:
            logger.info(f"Making API call to: {url} with params: {params} (Attempt {attempt + 1})")

            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=CONFIG['TIMEOUT']
            )

            logger.info(f"Response status: {response.status_code}")

            # Rate limit handling (429 Too Many Requests)
            if response.status_code == 429:
                if last_attempt:
                    # No attempt is left, so sleeping would only delay the error raised below
                    logger.error(f"Rate limit exceeded (429) on final attempt for endpoint: {endpoint}")
                    break

                retry_after = response.headers.get('Retry-After')
                if retry_after and retry_after.isdigit():
                    # The server told us how long to wait
                    wait_time = int(retry_after)
                else:
                    # Exponential backoff with jitter to avoid synchronized retries
                    jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
                    wait_time = backoff_time + jitter
                    backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])

                logger.warning(f"Rate limit exceeded (429). Waiting {wait_time:.2f} seconds before retry.")
                time.sleep(wait_time)
                continue

            # Log details for error responses before raising
            if response.status_code >= 400:
                logger.error(f"API error: Status {response.status_code}, Response: {response.text}")
                logger.error(f"Request URL: {response.request.url}")
                # The Authorization header carries the bearer token; never write it to the log
                safe_headers = {
                    name: ("<redacted>" if name.lower() == "authorization" else value)
                    for name, value in response.request.headers.items()
                }
                logger.error(f"Request headers: {safe_headers}")

            # Raises for any remaining 4xx/5xx status; handled by the except clause below
            response.raise_for_status()

            try:
                response_json = response.json()
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse response as JSON: {str(e)}")
                logger.error(f"Response content: {response.text[:1000]}")  # First 1000 chars only
                raise

            if isinstance(response_json, dict) and isinstance(response_json.get("value"), list):
                logger.info(f"Response contains {len(response_json['value'])} items in 'value' array")
            return response_json

        except requests.exceptions.RequestException as e:
            # 429s are handled above; everything else is retried with backoff
            if not (hasattr(e, 'response') and e.response is not None and e.response.status_code == 429):
                logger.warning(f"API call failed (Attempt {attempt + 1}): {str(e)}")

                if last_attempt:
                    logger.error(f"All retry attempts failed for endpoint: {endpoint}")
                    logger.error(f"Final error: {str(e)}")
                    raise

                jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
                wait_time = backoff_time + jitter
                backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])

                logger.info(f"Waiting {wait_time:.2f} seconds before retry.")
                time.sleep(wait_time)

    # Reached only when no attempt returned or raised, i.e. the attempts were used up on
    # rate-limit responses (or MAX_RETRIES is 0). Fail loudly instead of returning None.
    raise requests.exceptions.RequestException(
        f"Rate limit retries exhausted after {max_retries} attempts for endpoint: {endpoint}"
    )
# ==================================


# CELL 7 - Get Dataflows Function
# ==================================
def get_dataflows(access_token: str) -> List[Dict]:
    """
    Retrieve all dataflows from the Power BI API using GetDataflowsAsAdmin.

    A single request is made to CONFIG['DATAFLOWS_ENDPOINT'] with no pagination
    parameters, and the dataflows are read from the "value" array of the response.

    Args:
        access_token: The Azure AD access token

    Returns:
        list: The dataflow objects from the response's "value" array; an empty
        list when the array is missing, null or empty.

    Raises:
        requests.exceptions.RequestException: Re-raised from call_powerbi_api
            after being logged.
        RuntimeError: If the API helper returns something other than a JSON
            object (dict), so a failed call is never mistaken for "no dataflows".
    """
    logger.info("Retrieving all dataflows from Power BI API...")

    try:
        # No pagination parameters are sent; the whole list is expected in one response
        response_data = call_powerbi_api(CONFIG['DATAFLOWS_ENDPOINT'], access_token)
    except requests.exceptions.RequestException as e:
        logger.error(f"API call failed: {str(e)}")
        raise

    # Guard against None / non-object payloads before calling dict methods on them
    if not isinstance(response_data, dict):
        raise RuntimeError(
            f"Unexpected response from {CONFIG['DATAFLOWS_ENDPOINT']}: "
            f"expected a JSON object, got {type(response_data).__name__}"
        )

    if CONFIG['DEBUG_MODE']:
        logger.info(f"Response keys: {list(response_data.keys())}")

    # "value" may be absent or explicitly null; treat both as an empty result
    dataflows = response_data.get("value") or []

    if dataflows:
        logger.info(f"Retrieved {len(dataflows)} dataflows from API")

        # Log the first dataflow as a structural sample when debugging
        if CONFIG['DEBUG_MODE']:
            logger.info(f"Sample dataflow: {json.dumps(dataflows[0], indent=2)}")
    else:
        logger.warning("No dataflows found in API response")

    logger.info(f"Finished retrieving all dataflows. Total count: {len(dataflows)}")
    return dataflows
# ==================================


# CELL 8 - Create Dataflows DataFrame Function
# ==================================
def create_dataflows_dataframe(dataflows_data: List[Dict]) -> "DataFrame":
    """
    Convert the dataflows data into a PySpark DataFrame for the pbi_dataflows table.

    The Spark DataFrame is built directly from the API records with an explicit
    schema instead of going through pandas and type inference, so a column whose
    values are all None is still created as a string column rather than an
    untyped (VOID) one.

    This function:
    - Maps the API field objectId to the dataflowId column
    - Copies name, description, modelUrl, configuredBy and workspaceId, using
      None for fields missing from a record
    - Skips records without an objectId (the table key is declared non-nullable)
    - Adds an extraction_timestamp column set to the current timestamp

    Args:
        dataflows_data: List of dataflow dictionaries from the API

    Returns:
        DataFrame: A PySpark DataFrame with columns dataflowId, name, description,
        modelUrl, configuredBy, workspaceId and extraction_timestamp
    """
    # (API field, table column) pairs, in table column order
    field_mapping = [
        ("objectId", "dataflowId"),
        ("name", "name"),
        ("description", "description"),
        ("modelUrl", "modelUrl"),
        ("configuredBy", "configuredBy"),
        ("workspaceId", "workspaceId"),
    ]

    # Schema of the columns that come from the API
    core_schema = StructType([
        StructField("dataflowId", StringType(), False),     # Primary key, not nullable
        StructField("name", StringType(), True),
        StructField("description", StringType(), True),
        StructField("modelUrl", StringType(), True),
        StructField("configuredBy", StringType(), True),
        StructField("workspaceId", StringType(), True),
    ])

    # Full table schema: API columns plus the extraction timestamp
    dataflows_schema = StructType(
        core_schema.fields + [StructField("extraction_timestamp", TimestampType(), False)]
    )

    # Handle empty data case
    if not dataflows_data:
        logger.warning("No dataflows found. Creating empty DataFrame.")
        return spark.createDataFrame(spark.sparkContext.emptyRDD(), dataflows_schema)

    rows = []
    skipped_without_id = 0
    for dataflow in dataflows_data:
        if dataflow.get("objectId") is None:
            # A NULL key never matches in the MERGE condition, so such a row cannot be upserted
            skipped_without_id += 1
            continue
        # Coerce present values to str so they satisfy the StringType schema
        rows.append(tuple(
            None if dataflow.get(api_field) is None else str(dataflow.get(api_field))
            for api_field, _ in field_mapping
        ))

    if skipped_without_id:
        logger.warning(f"Skipped {skipped_without_id} dataflow record(s) without an objectId")

    if CONFIG['DEBUG_MODE']:
        column_names = [column_name for _, column_name in field_mapping]
        logger.info(f"DataFrame columns: {column_names}")
        logger.info(f"Sample record: {dict(zip(column_names, rows[0])) if rows else 'No records'}")

    # Explicit schema: no inference, so all-None columns keep their string type
    typed_df = spark.createDataFrame(rows, schema=core_schema)

    # Add metadata columns
    dataflows_df = typed_df.withColumn("extraction_timestamp", current_timestamp())

    # Log the final DataFrame schema for debugging
    if CONFIG['DEBUG_MODE']:
        logger.info(f"Final DataFrame schema: {dataflows_df.schema}")
        logger.info(f"DataFrame count: {dataflows_df.count()}")

    return dataflows_df
# ==================================


# CELL 9 - Delta Lake Operations Functions
# ==================================
def ensure_delta_table_exists(table_name: str, df_schema):
    """
    Ensure the Delta table exists with a usable schema, creating it if necessary.

    - If the table does not exist, an empty table with df_schema is created.
    - If it exists and any column has the VOID type, the table is dropped and
      recreated empty with df_schema (existing rows are discarded).
    - Otherwise the table is left untouched.

    Existence is checked explicitly through the catalog rather than by treating
    any exception as "table missing", so an unrelated failure while reading an
    existing table propagates instead of causing the table to be overwritten
    with an empty one.

    Note: spark.catalog.tableExists is available from PySpark 3.3 onward.

    Args:
        table_name: Name of the Delta table
        df_schema: Schema to create the table with

    Raises:
        Exception: Any Spark error raised while checking, reading, dropping or
            creating the table.
    """
    def _create_empty_table():
        # Write a zero-row DataFrame so the table is registered with df_schema
        spark.createDataFrame([], df_schema).write \
            .mode("overwrite") \
            .option("overwriteSchema", "true") \
            .saveAsTable(table_name)

    if not spark.catalog.tableExists(table_name):
        logger.info(f"Creating Delta table '{table_name}'")
        _create_empty_table()
        logger.info(f"Delta table '{table_name}' created successfully")
        return

    existing_schema = spark.table(table_name).schema

    # VOID columns cannot be used as merge targets
    has_void_columns = any(field.dataType.typeName() == 'void' for field in existing_schema.fields)
    if not has_void_columns:
        logger.info(f"Delta table '{table_name}' already exists with correct schema")
        return

    logger.warning(f"Table '{table_name}' exists but has VOID columns. Recreating with proper schema.")

    spark.sql(f"DROP TABLE IF EXISTS {table_name}")
    logger.info(f"Dropped existing table '{table_name}' with VOID columns")

    _create_empty_table()
    logger.info(f"Recreated Delta table '{table_name}' with proper schema")


def merge_dataflows_to_delta(source_df, table_name: str):
    """
    Upsert dataflow rows into the target Delta table.

    Flow:
    - An empty source is a no-op.
    - An empty target receives all source rows with a plain append.
    - Otherwise a MERGE keyed on dataflowId updates matching rows and inserts
      new ones.
    - If the MERGE fails, the table is rebuilt: target rows whose dataflowId is
      absent from the source are materialised first, the table is then
      overwritten (schema included) with the source rows, and the materialised
      rows are appended back on a best-effort basis.

    Args:
        source_df: DataFrame with new dataflow data
        table_name: Name of the target Delta table

    Raises:
        Exception: If the rebuild after a failed MERGE also fails.
    """
    logger.info(f"Starting merge operation for {table_name}")

    # Nothing to do for an empty source
    source_count = source_df.count()
    if source_count == 0:
        logger.warning("Source DataFrame is empty. Skipping merge operation.")
        return

    logger.info(f"Source DataFrame has {source_count} records")

    target_count = spark.table(table_name).count()
    logger.info(f"Target table has {target_count} records")

    # If the table is empty, just insert all records
    if target_count == 0:
        logger.info(f"Table {table_name} is empty. Inserting all records.")
        source_df.write.mode("append").saveAsTable(table_name)
        return

    # The MERGE statement reads the source through this temporary view
    source_df.createOrReplaceTempView("dataflow_updates")

    merge_query = f"""
    MERGE INTO {table_name} AS target
    USING dataflow_updates AS source
    ON target.dataflowId = source.dataflowId
    WHEN MATCHED THEN
        UPDATE SET 
            target.name = source.name,
            target.description = source.description,
            target.modelUrl = source.modelUrl,
            target.configuredBy = source.configuredBy,
            target.workspaceId = source.workspaceId,
            target.extraction_timestamp = source.extraction_timestamp
    WHEN NOT MATCHED THEN
        INSERT (dataflowId, name, description, modelUrl, configuredBy, workspaceId, extraction_timestamp)
        VALUES (source.dataflowId, source.name, source.description, source.modelUrl, source.configuredBy, source.workspaceId, source.extraction_timestamp)
    """

    try:
        spark.sql(merge_query)
    except Exception as merge_error:
        logger.error(f"Merge operation failed: {str(merge_error)}")
        logger.info("Attempting to recreate table and insert all data...")
    else:
        logger.info("Merge operation completed successfully")
        return

    try:
        # Capture the rows worth keeping BEFORE the table is replaced; a lazy DataFrame
        # over the table would otherwise be evaluated against the replaced table.
        preserved = _snapshot_rows_missing_from_source(source_df, table_name)

        # Replace data and schema in a single overwrite (no separate DROP, so the table is
        # never left missing if the write fails)
        source_df.write \
            .mode("overwrite") \
            .option("overwriteSchema", "true") \
            .saveAsTable(table_name)

        if preserved is not None:
            try:
                preserved_count = preserved.count()
                if preserved_count > 0:
                    logger.info("Attempting to preserve existing valid data...")
                    preserved.write.mode("append").saveAsTable(table_name)
                    logger.info(f"Preserved {preserved_count} existing records")
            except Exception as preserve_error:
                # Best effort: the new data is already in place
                logger.warning(f"Could not preserve existing data: {str(preserve_error)}")

        logger.info("Table recreated successfully with new data")

    except Exception as recreate_error:
        logger.error(f"Failed to recreate table: {str(recreate_error)}")
        raise


def _snapshot_rows_missing_from_source(source_df, table_name: str):
    """Materialise target rows whose dataflowId is not in source_df; None if that is not possible."""
    try:
        current_data = spark.table(table_name)
        if any(field.dataType.typeName() == 'void' for field in current_data.schema.fields):
            logger.info("Current table has invalid schema, will not preserve data")
            return None

        # Anti-join keeps only target rows that the source does not replace
        missing_from_source = current_data.join(
            source_df.select("dataflowId"),
            on="dataflowId",
            how="left_anti"
        )
        # Eager local checkpoint computes the rows now and cuts the lineage to the table
        return missing_from_source.localCheckpoint(eager=True)
    except Exception as preserve_error:
        logger.warning(f"Could not preserve existing data: {str(preserve_error)}")
        return None


def merge_users_to_delta(source_df, table_name: str):
    """
    No-op placeholder: the pbi_dataflows_users table is no longer needed, so
    nothing is merged. Both arguments are accepted and ignored.
    """
    return None


def optimize_delta_table(table_name: str):
    """
    Refresh the statistics of a Delta table as a lightweight optimization step.

    Runs ANALYZE TABLE ... COMPUTE STATISTICS on the table. Any failure is
    logged as a warning and swallowed, because this step is treated as
    non-critical for the rest of the process.

    Args:
        table_name: Name of the Delta table to optimize
    """
    logger.info(f"Optimizing Delta table '{table_name}'")

    analyze_statement = f"ANALYZE TABLE {table_name} COMPUTE STATISTICS"
    # Progress messages emitted, in order, once the statement has run
    success_messages = (
        "Table statistics updated successfully",
        "Delta table optimization completed via statistics computation",
        "Note: Microsoft Fabric may automatically optimize Delta tables",
    )

    try:
        spark.sql(analyze_statement)
        for message in success_messages:
            logger.info(message)
    except Exception as analyze_error:
        logger.warning(f"Table optimization step encountered an issue: {str(analyze_error)}")
        logger.info("Continuing with process - optimization is not critical for functionality")
# ==================================


# CELL 10 - Main Execution Function
# ==================================
def main():
    """
    Main execution function that orchestrates the entire process.

    Steps:
    1. Get the authentication token
    2. Retrieve all dataflows from the Power BI API
    3. Build the DataFrame for the dataflows table (empty when nothing was returned)
    4. Make sure the Delta table exists
    5. Merge the data into the Delta table and refresh its statistics
       (skipped when there is no data)
    6. Log table details, row count, summary statistics and the top workspaces

    Returns:
        DataFrame: The DataFrame built from the API response

    Raises:
        Exception: Any error from the steps above is logged and re-raised.
            Failures while producing the summary or workspace reports are only
            logged as warnings.
    """
    try:
        logger.info("Starting Power BI Dataflows to Delta Lake process")

        # Step 1: Get authentication token
        logger.info("Getting access token...")
        access_token = get_access_token()
        logger.info("Successfully obtained access token")

        # Step 2: Retrieve all dataflows
        logger.info("Retrieving dataflows from Power BI API...")
        dataflows_data = get_dataflows(access_token)

        # Step 3: Create DataFrame (an empty input yields an empty DataFrame with the table schema)
        if not dataflows_data:
            logger.warning("No dataflows found. Please check your permissions and API access.")
        else:
            logger.info(f"Creating DataFrame for {len(dataflows_data)} dataflows...")
        dataflows_df = create_dataflows_dataframe(dataflows_data or [])

        # Show sample data
        logger.info("Sample of dataflows data:")
        dataflows_df.show(5, truncate=False)

        # Step 4: Prepare Delta table
        dataflows_table = CONFIG["DATAFLOWS_TABLE_NAME"]
        ensure_delta_table_exists(dataflows_table, dataflows_df.schema)

        # Step 5: Merge data into Delta table (if we have data)
        if dataflows_data:
            merge_dataflows_to_delta(dataflows_df, dataflows_table)

            # Step 6: Optimize the Delta table
            optimize_delta_table(dataflows_table)

        # Step 7: Display final statistics
        logger.info("Loading completed successfully!")

        # Show table information
        logger.info(f"\n=== {dataflows_table} Table Details ===")
        spark.sql(f"DESCRIBE DETAIL {dataflows_table}").show(truncate=False)

        # Show row count
        dataflows_count = spark.table(dataflows_table).count()
        logger.info(f"Total rows in {dataflows_table}: {dataflows_count}")

        # Show comprehensive summary statistics (best effort)
        try:
            summary_stats = spark.sql(f"""
                SELECT 
                    COUNT(*) as total_dataflows,
                    COUNT(DISTINCT workspaceId) as unique_workspaces,
                    COUNT(DISTINCT configuredBy) as unique_owners,
                    COUNT(CASE WHEN description IS NOT NULL AND description != '' THEN 1 END) as dataflows_with_description,
                    COUNT(CASE WHEN modelUrl IS NOT NULL AND modelUrl != '' THEN 1 END) as dataflows_with_model_url,
                    MAX(extraction_timestamp) as last_updated
                FROM {dataflows_table}
            """)

            logger.info(f"\n=== {dataflows_table} Summary Statistics ===")
            summary_stats.show(truncate=False)
        except Exception as e:
            logger.warning(f"Error generating summary statistics: {str(e)}")
            # Show basic count instead
            logger.info(f"Basic row count for {dataflows_table}: {dataflows_count}")

        # Show workspace distribution for dataflows (best effort)
        try:
            workspace_distribution = spark.sql(f"""
                SELECT 
                    workspaceId,
                    COUNT(*) as dataflow_count
                FROM {dataflows_table}
                WHERE workspaceId IS NOT NULL
                GROUP BY workspaceId
                ORDER BY dataflow_count DESC
                LIMIT 10
            """)

            logger.info("\n=== Top 10 Workspaces by Dataflow Count ===")
            workspace_distribution.show(truncate=False)
        except Exception as e:
            logger.warning(f"Error generating workspace distribution: {str(e)}")

        # Return DataFrame for further analysis if needed
        return dataflows_df

    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")
        raise
# ==================================
        
# CELL 11 - Execute Main Function
# ==================================
# Run the whole pipeline when this code executes as the top-level script;
# the DataFrame returned by main() is bound to dataflows_df.
if __name__ == "__main__":
    dataflows_df = main()
# ==================================

StatementMeta(, 08f7bbd2-3093-4584-a86a-d1d10b5b1bdd, 4, Finished, Available, Finished)

+------------------------------------+----------------------------+----------------------------------------------------------+--------+------------------------+------------------------------------+--------------------------+
|dataflowId                          |name                        |description                                               |modelUrl|configuredBy            |workspaceId                         |extraction_timestamp      |
+------------------------------------+----------------------------+----------------------------------------------------------+--------+------------------------+------------------------------------+--------------------------+
|aa0029b7-4f7e-47a2-9310-3434c3c7d41f|Master Nova Notes           |This dataflow is connecting to the Master Nova Notes list.|NULL    |eepena@mdanderson.org   |9beed1e6-1368-4a64-9cb9-1a2a9630eb92|2025-07-17 15:23:39.724431|
|b0e50acc-edb0-4b21-b1e2-0562d9115888|EDI MOSAIQ Tx Start Dataflow|EDI MOSAIQ Tx Start Dataflow     

2025-07-17 15:23:49,625 - INFO - Delta table 'pbi_dataflows' already exists with correct schema
2025-07-17 15:23:49,626 - INFO - Starting merge operation for pbi_dataflows
2025-07-17 15:23:49,803 - INFO - Source DataFrame has 257 records
2025-07-17 15:23:53,156 - INFO - Target table has 241 records
2025-07-17 15:24:03,972 - INFO - Merge operation completed successfully
2025-07-17 15:24:03,973 - INFO - Optimizing Delta table 'pbi_dataflows'
2025-07-17 15:24:07,902 - INFO - Total rows in pbi_dataflows: 258
2025-07-17 15:24:08,113 - INFO - 
=== pbi_dataflows Summary Statistics ===


In [3]:
from pyspark.sql import SparkSession

# Obtain a Spark session (getOrCreate hands back the running session when one exists)
spark = (
    SparkSession.builder
    .appName("Refresh SQL Endpoint Metadata")
    .getOrCreate()
)

# Issue REFRESH TABLE for the dataflows table.
# NOTE(review): REFRESH TABLE is a Spark SQL statement; confirm it has the intended
# effect on the SQL endpoint that the app name refers to.
spark.sql("REFRESH TABLE pbi_dataflows")
print("Metadata refresh triggered successfully.")


StatementMeta(, 08f7bbd2-3093-4584-a86a-d1d10b5b1bdd, 5, Finished, Available, Finished)

Metadata refresh triggered successfully.
