In [1]:
# DO NOT DELETE THIS CELL

# API Name: Workspaces - List Workspaces
# Command:  GET https://api.fabric.microsoft.com/v1/admin/workspaces
# Doc:      https://learn.microsoft.com/en-us/rest/api/fabric/admin/workspaces/list-workspaces

# Loads table: fabric_workspaces

StatementMeta(, 4fda317f-525a-48f8-9434-77bfdb8ced77, 3, Finished, Available, Finished)

In [2]:
# CELL 1 - Title and Introduction
# ==================================
# Microsoft Fabric Workspaces to Delta Lake - PySpark Notebook
# This notebook retrieves Microsoft Fabric workspaces and loads them into a Delta Lake table
# with optimization for analytics workloads
# ==================================


# CELL 2 - Import Libraries
# ==================================
# Import required libraries
import requests
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit, from_json
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import logging
from typing import Dict, List, Optional
from delta.tables import DeltaTable
import random
import time
# ==================================


# CELL 3 - Configure Logging and Initialize Spark
# ==================================
# Logging setup: timestamped INFO-level messages so each step of the run
# can be followed (and debugged) from the cell output.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Spark session with the Delta Lake SQL extension and catalog configured.
# getOrCreate() is used, so a session that already exists is picked up.
spark = (
    SparkSession.builder
    .appName("FabricWorkspacesToDelta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Delta write tuning: enable optimized writes and automatic compaction.
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")
# ==================================


# CELL 4 - Configuration Parameters
# ==================================
# Notebook-wide settings, grouped by concern. Every function below reads
# its tunables from this single mapping.
CONFIG = dict(
    # --- Fabric REST API ---
    API_BASE_URL="https://api.fabric.microsoft.com/v1",
    WORKSPACES_ENDPOINT="/admin/workspaces",  # List Workspaces (admin) endpoint
    # --- Retry / backoff behaviour ---
    MAX_RETRIES=5,            # attempts per API call (rate limits included)
    INITIAL_BACKOFF_SEC=1,    # first backoff delay, in seconds
    MAX_BACKOFF_SEC=60,       # upper bound for the backoff delay, in seconds
    BACKOFF_FACTOR=2,         # multiplier applied to the backoff after each wait
    JITTER_FACTOR=0.1,        # random jitter, as a fraction of the current backoff
    TIMEOUT=30,               # per-request timeout, in seconds
    # --- Target storage ---
    WORKSPACES_TABLE_NAME="fabric_workspaces",  # target Delta table
    LAKEHOUSE_PATH="Tables",  # default Tables folder in a Fabric Lakehouse
    # --- Diagnostics ---
    DEBUG_MODE=True,          # True enables extra debugging output
)

# Do not use the "top" parameter in API calls as instructed
# ==================================


# CELL 5 - Authentication Function
# ==================================
def get_access_token():
    """
    Fetch an access token for the Fabric REST API via mssparkutils.

    The token is requested for the audience "https://api.fabric.microsoft.com"
    using the credentials helper that the notebook runtime provides.

    Returns:
        str: The access token

    Raises:
        Exception: Any failure (including a missing notebookutils module) is
        logged and then re-raised unchanged.
    """
    fabric_audience = "https://api.fabric.microsoft.com"
    try:
        # Imported lazily: notebookutils only exists inside the notebook runtime
        from notebookutils import mssparkutils
        return mssparkutils.credentials.getToken(fabric_audience)
    except Exception as e:
        logger.error(f"Failed to get access token: {str(e)}")
        raise
# ==================================


# CELL 6 - API Call Function
# ==================================
def call_fabric_api(endpoint: str, access_token: str, params: Optional[Dict] = None) -> Dict:
    """
    Make a GET call to the Microsoft Fabric REST API with retry handling.

    Behaviour:
    - Sends the bearer token in the Authorization header.
    - On 429 (rate limit) waits for the server's Retry-After value when it is
      a whole number of seconds, otherwise uses exponential backoff with jitter.
    - Retries connection errors, timeouts, 408 and 5xx responses with the same
      backoff; other 4xx responses fail immediately because repeating the
      identical request cannot succeed.
    - Never returns None: once the attempts are exhausted the last error is
      raised, including when every attempt was rate limited.
    - Never writes the bearer token to the log.

    Args:
        endpoint: The API endpoint path (e.g., "/admin/workspaces"); it is
            appended verbatim to CONFIG['API_BASE_URL']
        access_token: The Azure AD access token
        params: Optional query parameters for the API call

    Returns:
        dict: The JSON response from the API

    Raises:
        requests.exceptions.RequestException: If the API call fails with a
            non-retryable status or after all retries
        ValueError: If the response body is not valid JSON and the installed
            requests version does not wrap the decoding error in a
            RequestException subclass
    """
    url = f"{CONFIG['API_BASE_URL']}{endpoint}"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }
    max_attempts = CONFIG['MAX_RETRIES']
    backoff_time = CONFIG['INITIAL_BACKOFF_SEC']

    def next_backoff_wait() -> float:
        """Return the current backoff plus jitter, then grow the backoff (capped)."""
        nonlocal backoff_time
        jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
        wait_time = backoff_time + jitter
        backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])
        return wait_time

    for attempt in range(max_attempts):
        last_attempt = attempt == max_attempts - 1
        try:
            logger.info(f"Making API call to: {url} with params: {params} (Attempt {attempt + 1})")

            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=CONFIG['TIMEOUT']
            )

            logger.info(f"Response status: {response.status_code}")

            # Rate limit handling (429 Too Many Requests). On the final attempt
            # we fall through to raise_for_status() instead of sleeping for
            # nothing and silently returning None.
            if response.status_code == 429 and not last_attempt:
                retry_after = response.headers.get('Retry-After')

                if retry_after and retry_after.isdigit():
                    # Server specified a wait time in seconds: honour it
                    wait_time = int(retry_after)
                else:
                    wait_time = next_backoff_wait()

                logger.warning(f"Rate limit exceeded (429). Waiting {wait_time:.2f} seconds before retry.")
                time.sleep(wait_time)
                continue

            if response.status_code >= 400:
                # Redact the Authorization header so the token never reaches the logs
                safe_headers = {
                    name: ("<redacted>" if name.lower() == "authorization" else value)
                    for name, value in response.request.headers.items()
                }
                logger.error(f"API error: Status {response.status_code}, Response: {response.text}")
                logger.error(f"Request URL: {response.request.url}")
                logger.error(f"Request headers: {safe_headers}")

            # Turns any remaining 4xx/5xx status into an HTTPError
            response.raise_for_status()

            try:
                response_json = response.json()
            except ValueError as e:
                # json.JSONDecodeError and requests' own JSONDecodeError both
                # derive from ValueError
                logger.error(f"Failed to parse response as JSON: {str(e)}")
                logger.error(f"Response content: {response.text[:1000]}")  # Log first 1000 chars of response
                raise

            if isinstance(response_json, dict):
                if isinstance(response_json.get("value"), list):
                    logger.info(f"Response contains {len(response_json['value'])} items in 'value' array")
                if "continuationToken" in response_json:
                    logger.info(f"Response contains continuationToken: {response_json['continuationToken']}")
            return response_json

        except requests.exceptions.RequestException as e:
            # e.response may be None (connection errors, timeouts); a Response
            # object is falsy for error statuses, so avoid truthiness tests.
            status = getattr(getattr(e, 'response', None), 'status_code', None)
            non_retryable = status is not None and 400 <= status < 500 and status not in (408, 429)

            logger.warning(f"API call failed (Attempt {attempt + 1}): {str(e)}")

            if non_retryable:
                logger.error(f"Non-retryable status {status} for endpoint: {endpoint}")
                raise

            if last_attempt:
                logger.error(f"All retry attempts failed for endpoint: {endpoint}")
                logger.error(f"Final error: {str(e)}")
                raise

            wait_time = next_backoff_wait()
            logger.info(f"Waiting {wait_time:.2f} seconds before retry.")
            time.sleep(wait_time)

    # Only reachable when MAX_RETRIES allows no attempt at all (<= 0)
    raise requests.exceptions.RetryError(
        f"No successful response for endpoint {endpoint} within {max_attempts} attempts"
    )
# ==================================


# CELL 7 - Get Workspaces Function (CORRECTED)
# ==================================
def get_workspaces(access_token: str) -> List[Dict]:
    """
    Retrieve all workspaces from the Fabric API, handling pagination.

    Every page, including continuation pages, is fetched through
    call_fabric_api so that a single retry implementation is used. A page
    request that yields no usable JSON object raises instead of re-processing
    the previous page's data.

    Args:
        access_token: The Azure AD access token

    Returns:
        list: A list of all workspace objects

    Raises:
        requests.exceptions.RequestException: If a page cannot be retrieved
    """
    all_workspaces = []
    continuation_token = None
    page_count = 0

    while True:
        page_count += 1
        endpoint = CONFIG['WORKSPACES_ENDPOINT']

        if continuation_token:
            # The token is embedded in the endpoint exactly as the API returned
            # it (the sample run shows it already percent-encoded, e.g. "%3D").
            # Passing it through `params` would encode it a second time.
            endpoint = f"{endpoint}?continuationToken={continuation_token}"
            if CONFIG['DEBUG_MODE']:
                logger.info(
                    f"Page {page_count}: Making request with continuation token to URL: "
                    f"{CONFIG['API_BASE_URL']}{endpoint}"
                )
        elif CONFIG['DEBUG_MODE']:
            logger.info(f"Page {page_count}: Making initial request")

        try:
            response_data = call_fabric_api(endpoint, access_token)
        except requests.exceptions.RequestException as e:
            logger.error(f"API call failed on page {page_count}: {str(e)}")
            raise

        # Guard against a missing/unexpected payload; continuing would either
        # crash later or loop forever on stale data.
        if not isinstance(response_data, dict):
            logger.error(f"No usable response received for page {page_count}")
            raise requests.exceptions.RetryError(
                f"No usable response received for page {page_count} of {CONFIG['WORKSPACES_ENDPOINT']}"
            )

        if CONFIG['DEBUG_MODE']:
            logger.info(f"Response keys: {list(response_data.keys())}")

        # The admin endpoint returns its items under "workspaces" (not "value")
        workspaces = response_data.get("workspaces", [])

        if workspaces:
            all_workspaces.extend(workspaces)
            logger.info(f"Retrieved {len(workspaces)} workspaces on page {page_count}. Running total: {len(all_workspaces)}")

            if CONFIG['DEBUG_MODE']:
                logger.info(f"Sample workspace: {json.dumps(workspaces[0], indent=2)}")
        else:
            logger.warning(f"No workspaces found on page {page_count}")

        # A continuation token in the response means another page follows
        continuation_token = response_data.get("continuationToken")

        if continuation_token:
            logger.info(f"Found continuation token: {continuation_token}")
        else:
            logger.info("No continuation token found - this is the last page")
            break

    logger.info(f"Finished retrieving all workspaces. Total count: {len(all_workspaces)}")
    return all_workspaces
# ==================================


# CELL 8 - Create Enhanced DataFrame Function
# ==================================
def create_enhanced_workspaces_dataframe(workspaces_data: List[Dict]) -> "DataFrame":
    """
    Convert the workspaces data into a PySpark DataFrame for Delta Lake.

    This function:
    - Keeps only the required fields (capacityId, id, name, state, type)
    - Builds the DataFrame against an explicit schema, so no type inference
      takes place and a column that is null in every row cannot break it
    - Skips (with a warning) records that have no "id", because id is the
      non-nullable merge key
    - Adds an extraction_timestamp metadata column

    Args:
        workspaces_data: List of workspace dictionaries from the API

    Returns:
        DataFrame: A PySpark DataFrame with columns capacityId, id, name,
        state, type, extraction_timestamp
    """
    source_columns = ["capacityId", "id", "name", "state", "type"]

    # Schema of the API-sourced columns; only "id" is mandatory
    source_schema = StructType([
        StructField("capacityId", StringType(), True),  # True = nullable
        StructField("id", StringType(), False),         # False = not nullable
        StructField("name", StringType(), True),
        StructField("state", StringType(), True),
        StructField("type", StringType(), True),
    ])
    output_schema = StructType(
        source_schema.fields + [StructField("extraction_timestamp", TimestampType(), False)]
    )

    rows = []
    skipped = 0
    for workspace in workspaces_data or []:
        if workspace.get("id") is None:
            skipped += 1
            continue
        # Values are coerced to str so they always satisfy the StringType schema
        rows.append(tuple(
            None if workspace.get(column) is None else str(workspace.get(column))
            for column in source_columns
        ))

    if skipped:
        logger.warning(f"Skipped {skipped} workspace record(s) without an id")

    if not rows:
        logger.warning("No workspaces found. Creating empty DataFrame.")
        return spark.createDataFrame([], output_schema)

    spark_df = spark.createDataFrame(rows, source_schema)

    # Add metadata column for tracking when this data was extracted
    return spark_df.withColumn("extraction_timestamp", current_timestamp())
# ==================================


# CELL 9 - Delta Lake Operations Functions
# ==================================
def ensure_delta_table_exists(table_name: str, df_schema):
    """
    Ensure the Delta table exists, creating it if necessary.

    Existence is checked through the Spark catalog rather than by catching
    every exception from a DESCRIBE statement, so an unrelated failure (for
    example a transient or permission error) is surfaced instead of being
    mistaken for "table missing". The table is created with write mode
    "ignore", which never replaces a table that already exists.

    Args:
        table_name: Name of the Delta table
        df_schema: Schema of the DataFrame
    """
    if spark.catalog.tableExists(table_name):
        logger.info(f"Delta table '{table_name}' already exists")
        return

    logger.info(f"Creating Delta table '{table_name}'")

    # An empty DataFrame carrying the schema is enough to define the table
    empty_df = spark.createDataFrame([], df_schema)

    # Create the table (without partitioning); "ignore" keeps this a no-op
    # should the table have appeared since the existence check.
    empty_df.write \
        .mode("ignore") \
        .saveAsTable(table_name)

    logger.info(f"Delta table '{table_name}' created successfully")


def merge_data_to_delta(source_df, table_name: str):
    """
    Merge new workspace data into the Delta table using MERGE operation.

    This function performs an upsert operation:
    - Updates existing records if workspace ID matches
    - Inserts new records if workspace ID doesn't exist

    The source is de-duplicated on "id" first: more than one source row for
    the same id would make the MERGE ambiguous, and would insert duplicate
    rows on the empty-table path.

    Args:
        source_df: DataFrame with new data
        table_name: Name of the target Delta table
    """
    logger.info(f"Starting merge operation for {table_name}")

    # One row per workspace id; all rows come from the same extraction, so
    # which duplicate is kept does not matter.
    deduped_df = source_df.dropDuplicates(["id"])

    # Temporary view referenced by the MERGE statement below
    deduped_df.createOrReplaceTempView("workspace_updates")

    # If the table is empty, just insert all records.
    # limit(1) avoids counting the whole table just to test for emptiness.
    if spark.table(table_name).limit(1).count() == 0:
        logger.info(f"Table {table_name} is empty. Inserting all records.")
        deduped_df.write.mode("append").saveAsTable(table_name)
        return

    # Perform the merge operation
    merge_query = f"""
    MERGE INTO {table_name} AS target
    USING workspace_updates AS source
    ON target.id = source.id
    WHEN MATCHED THEN
        UPDATE SET 
            target.capacityId = source.capacityId,
            target.name = source.name,
            target.state = source.state,
            target.type = source.type,
            target.extraction_timestamp = source.extraction_timestamp
    WHEN NOT MATCHED THEN
        INSERT *
    """

    spark.sql(merge_query)
    logger.info("Merge operation completed successfully")


def optimize_delta_table(table_name: str):
    """
    Refresh the table's statistics to help query planning.

    Only ANALYZE TABLE ... COMPUTE STATISTICS is executed; no OPTIMIZE or
    ZORDER command is issued. The step is best-effort: any failure is logged
    as a warning and swallowed so the surrounding load is not interrupted.

    Args:
        table_name: Name of the Delta table to optimize
    """
    logger.info(f"Optimizing Delta table '{table_name}'")

    try:
        # Update table statistics for better query planning
        spark.sql(f"ANALYZE TABLE {table_name} COMPUTE STATISTICS")
        logger.info("Table statistics updated successfully")

        logger.info("Delta table optimization completed via statistics computation")
        logger.info("Note: Microsoft Fabric may automatically optimize Delta tables")
    except Exception as e:
        # Deliberately non-fatal: statistics are an optimisation, not a requirement
        logger.warning(f"Table optimization step encountered an issue: {str(e)}")
        logger.info("Continuing with process - optimization is not critical for functionality")
# ==================================


# CELL 10 - Main Execution Function
# ==================================
def main():
    """
    Run the end-to-end load of Fabric workspaces into the Delta table.

    Sequence:
    1. Obtain an access token
    2. Fetch every workspace from the API
    3. Build the DataFrame (an empty, correctly typed one if nothing came back)
    4. Make sure the target table exists
    5. Merge the data and refresh statistics (skipped when there is no data)
    6. Print table details, row count and summary queries

    Returns:
        DataFrame: The workspaces DataFrame built in step 3

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    try:
        logger.info("Starting Fabric Workspaces to Delta Lake process")

        # --- Authentication ---
        logger.info("Getting access token...")
        access_token = get_access_token()
        logger.info("Successfully obtained access token")

        # --- Extraction ---
        logger.info("Retrieving workspaces from Fabric API...")
        workspaces_data = get_workspaces(access_token)

        # --- DataFrame construction ---
        if workspaces_data:
            logger.info(f"Creating DataFrame for {len(workspaces_data)} workspaces...")
            workspaces_df = create_enhanced_workspaces_dataframe(workspaces_data)
        else:
            logger.warning("No workspaces found. Please check your permissions and API access.")
            # Typed, empty frame so the table structure stays consistent
            string_columns = [
                ("capacityId", True),
                ("id", False),
                ("name", True),
                ("state", True),
                ("type", True),
            ]
            empty_schema = StructType(
                [StructField(column, StringType(), nullable) for column, nullable in string_columns]
                + [StructField("extraction_timestamp", TimestampType(), False)]
            )
            workspaces_df = spark.createDataFrame([], empty_schema)

        logger.info("Sample of enhanced workspaces data:")
        workspaces_df.show(5, truncate=False)

        # --- Target table ---
        table_name = CONFIG["WORKSPACES_TABLE_NAME"]
        ensure_delta_table_exists(table_name, workspaces_df.schema)

        # --- Load (only when there is something to load) ---
        if workspaces_data:
            merge_data_to_delta(workspaces_df, table_name)
            optimize_delta_table(table_name)

        # --- Reporting ---
        logger.info("Loading completed successfully!")

        table_detail = spark.sql(f"DESCRIBE DETAIL {table_name}")
        table_detail.show(truncate=False)

        row_count = spark.table(table_name).count()
        logger.info(f"Total rows in {table_name}: {row_count}")

        # Overall counts and the most recent extraction time
        summary_stats = spark.sql(f"""
            SELECT 
                COUNT(*) as total_workspaces,
                COUNT(DISTINCT capacityId) as unique_capacities,
                COUNT(DISTINCT state) as workspace_states,
                COUNT(DISTINCT type) as workspace_types,
                MAX(extraction_timestamp) as last_updated
            FROM {table_name}
        """)

        logger.info("Summary statistics:")
        summary_stats.show(truncate=False)

        # Number of workspaces per state, most frequent first
        state_distribution = spark.sql(f"""
            SELECT 
                state,
                COUNT(*) as count
            FROM {table_name}
            GROUP BY state
            ORDER BY count DESC
        """)

        logger.info("Workspace distribution by state:")
        state_distribution.show(truncate=False)

        return workspaces_df

    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")
        raise
# ==================================


# CELL 11 - Execute Main Function
# ==================================
# Entry point: run the full load when this code is executed as the main module.
# main() returns the workspaces DataFrame it built; keeping it in
# `workspaces_df` makes it available for inspection after the run.
if __name__ == "__main__":
    workspaces_df = main()
# ==================================


# CELL 12 - Maintenance and Best Practices
# ==================================
"""
MAINTENANCE AND BEST PRACTICES:

1. SCHEDULED UPDATES:
   - Schedule this notebook to run on a regular basis (daily/weekly)
   - Configure it as part of a Fabric pipeline
   - Consider capturing historical snapshots by using a timestamp partition

2. DELTA LAKE MAINTENANCE:
   - Run VACUUM periodically to clean old files (if supported in your Fabric environment):
     spark.sql(f"VACUUM {CONFIG['WORKSPACES_TABLE_NAME']} RETAIN 168 HOURS")
   - Monitor history retention and storage usage
   - Review table properties and statistics

3. MONITORING AND ALERTING:
   - Set up alerts for workspace state changes
   - Monitor for capacity changes
   - Track workspace count by type for governance

4. POWER BI INTEGRATION:
   - Create dashboards showing workspace distribution by capacity
   - Monitor workspace states
   - Visualize workspace types across the organization

5. DATA SECURITY:
   - Implement appropriate access controls on the Delta table
   - Consider who should have access to workspace metadata
   - Document security implications of workspace settings

6. PERFORMANCE OPTIMIZATION:
   - Consider partitioning strategies if data grows significantly
   - Create joined views with other Fabric metadata tables for broader analytics
   - Use caching for frequently accessed data

Example analysis query - Find distribution of workspaces by type and state:
```sql
SELECT 
  type,
  state,
  COUNT(*) as workspace_count
FROM fabric_workspaces
GROUP BY type, state
ORDER BY type, state
```

7. ERROR RECOVERY:
   - Use Delta time travel for recovery (if supported in your Fabric environment):
     spark.read.option("versionAsOf", 1).table(CONFIG['WORKSPACES_TABLE_NAME'])
   - Implement logging for all workspace changes
   - Create snapshots before major tenant changes
"""
# ==================================

StatementMeta(, 4fda317f-525a-48f8-9434-77bfdb8ced77, 4, Finished, Available, Finished)

2025-07-16 15:45:27,899 - INFO - Starting Fabric Workspaces to Delta Lake process
2025-07-16 15:45:27,899 - INFO - Getting access token...
2025-07-16 15:45:28,695 - INFO - Successfully obtained access token
2025-07-16 15:45:28,696 - INFO - Retrieving workspaces from Fabric API...
2025-07-16 15:45:28,696 - INFO - Page 1: Making initial request
2025-07-16 15:45:28,697 - INFO - Making API call to: https://api.fabric.microsoft.com/v1/admin/workspaces with params: None (Attempt 1)
2025-07-16 15:45:30,014 - INFO - Response status: 200
2025-07-16 15:45:30,024 - INFO - Response contains continuationToken: MTAwMDAsMTAwMDA%3D
2025-07-16 15:45:30,027 - INFO - Response keys: ['workspaces', 'continuationUri', 'continuationToken']
2025-07-16 15:45:30,027 - INFO - Retrieved 10000 workspaces on page 1. Running total: 10000
2025-07-16 15:45:30,028 - INFO - Sample workspace: {
  "id": "47298625-cc8c-4967-82bf-2c97da0254af",
  "name": "IAI - BI Delivery Team",
  "state": "Active",
  "type": "Workspace",


+------------------------------------+------------------------------------+---------------------------+------+---------+--------------------------+
|capacityId                          |id                                  |name                       |state |type     |extraction_timestamp      |
+------------------------------------+------------------------------------+---------------------------+------+---------+--------------------------+
|665D4010-E0FD-4821-B5B0-6C2760C1D498|47298625-cc8c-4967-82bf-2c97da0254af|IAI - BI Delivery Team     |Active|Workspace|2025-07-16 15:45:37.232706|
|6BA0A957-48B9-4AD6-B422-6D779CC7DFF3|2022d60a-0ea0-460a-b794-8ac87ef3a916|IAI Management             |Active|Workspace|2025-07-16 15:45:37.232706|
|646712A9-B160-4E8C-922D-A32ACB6EA7AA|49ec4821-c19f-4a69-ab00-53ed07fdc830|EDI Analytics              |Active|Workspace|2025-07-16 15:45:37.232706|
|6F2BAF05-9C63-4402-BE41-C719011112F2|aad173a4-46ac-4b68-8718-e1ebc008f238|Github 8/26/2024 1:59:22 PM|Active|Wo

'\nMAINTENANCE AND BEST PRACTICES:\n\n1. SCHEDULED UPDATES:\n   - Schedule this notebook to run on a regular basis (daily/weekly)\n   - Configure it as part of a Fabric pipeline\n   - Consider capturing historical snapshots by using a timestamp partition\n\n2. DELTA LAKE MAINTENANCE:\n   - Run VACUUM periodically to clean old files (if supported in your Fabric environment):\n     spark.sql(f"VACUUM {CONFIG[\'WORKSPACES_TABLE_NAME\']} RETAIN 168 HOURS")\n   - Monitor history retention and storage usage\n   - Review table properties and statistics\n\n3. MONITORING AND ALERTING:\n   - Set up alerts for workspace state changes\n   - Monitor for capacity changes\n   - Track workspace count by type for governance\n\n4. POWER BI INTEGRATION:\n   - Create dashboards showing workspace distribution by capacity\n   - Monitor workspace states\n   - Visualize workspace types across the organization\n\n5. DATA SECURITY:\n   - Implement appropriate access controls on the Delta table\n   - Consider wh

In [3]:
from pyspark.sql import SparkSession

# Obtain a Spark session via getOrCreate(); this rebinds the name `spark`.
# NOTE(review): presumably this returns the session that is already running
# in the notebook rather than a new one - confirm.
spark = SparkSession.builder.appName("Refresh SQL Endpoint Metadata").getOrCreate()

# Issue REFRESH TABLE for the table loaded above (name is hard-coded here;
# it matches CONFIG["WORKSPACES_TABLE_NAME"]).
# NOTE(review): REFRESH TABLE acts on this Spark session's view of the table;
# whether it also refreshes the Fabric SQL analytics endpoint, as the
# appName and the message below suggest, is not shown here - confirm.
spark.sql("REFRESH TABLE fabric_workspaces")
print("Metadata refresh triggered successfully.")


StatementMeta(, 4fda317f-525a-48f8-9434-77bfdb8ced77, 5, Finished, Available, Finished)

Metadata refresh triggered successfully.
