In [1]:
# DO NOT DELETE THIS CELL

# API Name: Items - List Items
# Command:  GET https://api.fabric.microsoft.com/v1/admin/items
# Doc:      https://learn.microsoft.com/en-us/rest/api/fabric/admin/items/list-items

# Loads table: fabric_items
# Loads table: fabric_items_creators

StatementMeta(, f516eaea-ae7d-4600-9404-46789e43c376, 3, Finished, Available, Finished)

In [2]:
# CELL 1 - Title and Introduction
# ==================================
# Microsoft Fabric Items to Delta Lake - PySpark Notebook
# This notebook retrieves Microsoft Fabric items using the List Items API and loads them
# into two normalized Delta Lake tables:
# 1. fabric_items - Core item information
# 2. fabric_items_creators - Normalized creator/principal information
# This design eliminates data duplication and supports flexible analytics queries
# ==================================


# CELL 2 - Import Libraries
# ==================================
# Import required libraries
import requests
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit, from_json, when
from pyspark.sql.types import (
    StructType, StructField, StringType, TimestampType, 
    ArrayType, BooleanType
)
import logging
from typing import Dict, List, Optional, Tuple
from delta.tables import DeltaTable
import random
import time
from datetime import datetime
# ==================================


# CELL 3 - Configure Logging and Initialize Spark
# ==================================
# Logging setup
# INFO-level, timestamped messages let us follow the run and debug issues
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Build (or reuse) the Spark session with the Delta Lake extension and catalog set.
# In Fabric notebooks, Spark is pre-configured with Delta support
spark = (
    SparkSession.builder
    .appName("FabricItemsToDelta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Delta write tuning: optimized writes and automatic compaction
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")

# Parquet timestamp handling: use CORRECTED rebase mode for both datetime and
# INT96 values, on write and on read, so ancient timestamps that may exist in
# the API data do not break Parquet I/O
spark.conf.set("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
spark.conf.set("spark.sql.parquet.datetimeRebaseModeInRead", "CORRECTED")
spark.conf.set("spark.sql.parquet.int96RebaseModeInWrite", "CORRECTED")
spark.conf.set("spark.sql.parquet.int96RebaseModeInRead", "CORRECTED")
# ==================================


# CELL 4 - Configuration Parameters
# ==================================
# Configuration Parameters
# Central settings dictionary read by the functions defined later in this notebook.
# Retry/backoff values drive the API retry loops; table names identify the Delta tables.
CONFIG = {
    "API_BASE_URL": "https://api.fabric.microsoft.com/v1",
    "ITEMS_ENDPOINT": "/admin/items",  # Endpoint path (appended to API_BASE_URL) for listing all Fabric items
    "MAX_RETRIES": 5,  # Maximum number of attempts per API request
    "INITIAL_BACKOFF_SEC": 1,  # Initial backoff time in seconds
    "MAX_BACKOFF_SEC": 60,  # Upper bound for the backoff time in seconds
    "BACKOFF_FACTOR": 2,  # Exponential backoff multiplier applied after each backoff wait
    "JITTER_FACTOR": 0.1,  # Random jitter added to each backoff wait (as a fraction of the backoff)
    "TIMEOUT": 30,  # API request timeout in seconds
    "ITEMS_TABLE_NAME": "fabric_items",  # Name of the main items Delta table
    "CREATORS_TABLE_NAME": "fabric_items_creators",  # Name of the creators Delta table
    "LAKEHOUSE_PATH": "Tables",  # Default Tables folder in Fabric Lakehouse
    "DEBUG_MODE": True,  # Set to True to enable extra debugging output
    "RATE_LIMIT_MAX_REQUESTS": 200,  # API limit: 200 requests per hour (used in log messages)
    "PAGE_SIZE": 10000  # Maximum records per API request (API limit)
}

# Note: The API has a rate limit of 200 requests per hour
# With 10,000 records per request, we can theoretically retrieve up to 2M items per hour
# (200 requests x 10,000 records)
logger.info(f"API Rate Limit: {CONFIG['RATE_LIMIT_MAX_REQUESTS']} requests/hour")
logger.info(f"Max records per request: {CONFIG['PAGE_SIZE']}")
# ==================================


# CELL 5 - Authentication Function
# ==================================
def get_access_token():
    """
    Fetch an Azure AD access token for the Fabric REST API.

    Inside a Fabric notebook the token is issued by mssparkutils, the utility
    library that Microsoft Fabric provides for handling authentication.

    Returns:
        str: The access token for https://api.fabric.microsoft.com

    Raises:
        Exception: Whatever error the import or the token request raised; it is
            logged and then re-raised unchanged.

    Note:
        The caller must be a Fabric administrator or authenticate using a service principal.
        Required Delegated Scopes: Tenant.Read.All or Tenant.ReadWrite.All
    """
    try:
        # notebookutils only exists inside the Fabric runtime, hence the local import
        from notebookutils import mssparkutils
        fabric_token = mssparkutils.credentials.getToken("https://api.fabric.microsoft.com")
    except Exception as token_error:
        logger.error(f"Failed to get access token: {str(token_error)}")
        logger.error("Make sure you have Fabric administrator permissions or are using a service principal")
        raise
    else:
        logger.info("Successfully obtained access token for Fabric API")
        return fabric_token
# ==================================


# CELL 6 - API Call Function with Rate Limiting
# ==================================
def call_fabric_api(endpoint: str, access_token: str, params: Optional[Dict] = None) -> Dict:
    """
    Make a GET request to the Microsoft Fabric REST API with retry handling.

    Behavior:
    - Sends the bearer token in the Authorization header.
    - On HTTP 429, waits for the server's Retry-After value when it is a plain
      integer, otherwise for an exponential backoff with random jitter, then retries.
    - On any other request failure (HTTP error status, timeout, connection error),
      retries with the same backoff until MAX_RETRIES attempts have been made.
    - Never returns None: if the final attempt is still rate limited, an
      exception is raised instead of silently falling out of the loop.
    - The Authorization header is redacted before request headers are logged,
      so the access token does not end up in notebook output.

    Args:
        endpoint: The API endpoint path (e.g., "/admin/items")
        access_token: The Azure AD access token
        params: Optional query parameters for the API call

    Returns:
        dict: The JSON response from the API

    Raises:
        requests.exceptions.RequestException: If the API call fails after all retries.
            When the last attempt was still answered with 429 this is a
            requests.exceptions.RetryError (a RequestException subclass).
        ValueError: If a successful response body cannot be parsed as JSON and the
            installed requests version does not wrap that error in a RequestException.
    """
    url = f"{CONFIG['API_BASE_URL']}{endpoint}"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }

    max_retries = CONFIG['MAX_RETRIES']
    backoff_time = CONFIG['INITIAL_BACKOFF_SEC']

    def next_backoff_wait() -> float:
        """Return the next wait (backoff plus jitter) and grow the backoff for the following retry."""
        nonlocal backoff_time
        wait = backoff_time + random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
        backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])
        return wait

    for attempt in range(max_retries):
        last_attempt = attempt == max_retries - 1
        try:
            # Log the URL with parameters for debugging
            logger.info(f"Making API call to: {url} with params: {params} (Attempt {attempt + 1})")

            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=CONFIG['TIMEOUT']
            )
            logger.info(f"Response status: {response.status_code}")

            # Rate limit handling (429 Too Many Requests)
            if response.status_code == 429:
                retry_after = response.headers.get('Retry-After')

                if retry_after and retry_after.isdigit():
                    # The server told us how long to wait; the local backoff is left untouched
                    wait_time = int(retry_after)
                    logger.warning(f"Rate limit exceeded (429). Server requested wait time: {wait_time} seconds")
                else:
                    wait_time = next_backoff_wait()
                    logger.warning(f"Rate limit exceeded (429). Calculated wait time: {wait_time:.2f} seconds")

                logger.warning(f"Note: Fabric API has a limit of {CONFIG['RATE_LIMIT_MAX_REQUESTS']} requests per hour")

                if last_attempt:
                    # No retry left: leave the loop and raise below instead of
                    # sleeping and then returning None to the caller
                    break

                time.sleep(wait_time)
                continue

            # Log details for any other error status before raising
            if response.status_code >= 400:
                logger.error(f"API error: Status {response.status_code}, Response: {response.text}")
                logger.error(f"Request URL: {response.request.url}")
                # Never write the bearer token to the logs
                safe_headers = {
                    name: ("<redacted>" if name.lower() == "authorization" else value)
                    for name, value in response.request.headers.items()
                }
                logger.error(f"Request headers: {safe_headers}")

            # Raises requests.exceptions.HTTPError for 4xx/5xx responses
            response.raise_for_status()

            try:
                response_json = response.json()
            except ValueError as e:
                # json.JSONDecodeError (and the requests variant) derive from ValueError
                logger.error(f"Failed to parse response as JSON: {str(e)}")
                logger.error(f"Response content: {response.text[:1000]}")  # First 1000 chars only
                raise

            if "itemEntities" in response_json and isinstance(response_json["itemEntities"], list):
                logger.info(f"Response contains {len(response_json['itemEntities'])} items in 'itemEntities' array")
            if "continuationToken" in response_json:
                logger.info(f"Response contains continuationToken: {response_json['continuationToken']}")
            return response_json

        except requests.exceptions.RequestException as e:
            # 429 responses never reach this handler: they are dealt with before
            # raise_for_status() is called
            logger.warning(f"API call failed (Attempt {attempt + 1}): {str(e)}")

            if last_attempt:
                logger.error(f"All retry attempts failed for endpoint: {endpoint}")
                logger.error(f"Final error: {str(e)}")
                raise

            wait_time = next_backoff_wait()
            logger.info(f"Waiting {wait_time:.2f} seconds before retry.")
            time.sleep(wait_time)

    # Reached only when the final attempt was rate limited (or MAX_RETRIES is 0)
    message = f"No successful response for endpoint {endpoint} after {max_retries} attempts (rate limited)"
    logger.error(message)
    raise requests.exceptions.RetryError(message)
# ==================================


# CELL 7 - Get All Items Function with Pagination
# ==================================
def _fetch_continuation_page(url: str, access_token: str, page_count: int) -> Dict:
    """Fetch one follow-up page from a pre-built URL, retrying on 429 and request errors."""
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }
    max_retries = CONFIG['MAX_RETRIES']
    backoff_time = CONFIG['INITIAL_BACKOFF_SEC']

    for attempt in range(max_retries):
        last_attempt = attempt == max_retries - 1
        try:
            logger.info(f"Making API call to: {url} (Attempt {attempt + 1})")
            response = requests.get(url, headers=headers, timeout=CONFIG['TIMEOUT'])
            logger.info(f"Response status: {response.status_code}")

            # Handle rate limiting
            if response.status_code == 429:
                retry_after = response.headers.get('Retry-After')
                if retry_after and retry_after.isdigit():
                    wait_time = int(retry_after)
                else:
                    jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
                    wait_time = backoff_time + jitter
                    backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])

                if last_attempt:
                    # Fall through to the RetryError below; returning nothing here
                    # would make the caller re-process the previous page
                    break

                logger.warning(f"Rate limit exceeded (429). Waiting {wait_time:.2f} seconds before retry.")
                time.sleep(wait_time)
                continue

            if response.status_code >= 400:
                logger.error(f"API error: Status {response.status_code}, Response: {response.text}")

            response.raise_for_status()
            return response.json()

        except requests.exceptions.RequestException as e:
            if last_attempt:
                logger.error(f"All retry attempts failed for page {page_count}")
                raise

            jitter = random.uniform(0, CONFIG['JITTER_FACTOR'] * backoff_time)
            wait_time = backoff_time + jitter
            backoff_time = min(backoff_time * CONFIG['BACKOFF_FACTOR'], CONFIG['MAX_BACKOFF_SEC'])

            logger.warning(f"API call failed (Attempt {attempt + 1}): {str(e)}")
            logger.info(f"Waiting {wait_time:.2f} seconds before retry.")
            time.sleep(wait_time)

    message = f"Page {page_count} was still rate limited after {max_retries} attempts"
    logger.error(message)
    raise requests.exceptions.RetryError(message)


def _count_item_types(items: List[Dict]) -> Dict[str, int]:
    """Count items per 'type'; a missing or null type is counted as 'Unknown'."""
    counts = {}
    for item in items:
        # `or` also covers an explicit null, which would otherwise become a None
        # key and break sorting of the summary
        item_type = item.get('type') or 'Unknown'
        counts[item_type] = counts.get(item_type, 0) + 1
    return counts


def get_all_items(access_token: str) -> List[Dict]:
    """
    Retrieve all items from the Fabric List Items API, following pagination.

    The first page is requested through call_fabric_api. Every further page is
    requested from a URL built by appending the returned continuationToken
    verbatim as a query-string value, until a response carries no token.
    No filters are applied by this function.

    Args:
        access_token: The Azure AD access token

    Returns:
        list: All item objects collected from the 'itemEntities' arrays

    Raises:
        requests.exceptions.RequestException: If a page cannot be retrieved after
            all retries. This includes requests.exceptions.RetryError when a page
            is still rate limited (429) on the final attempt, so a failed page is
            never silently replaced by the previous page's data.
    """
    all_items = []
    continuation_token = None
    page_count = 0

    logger.info("Starting to retrieve all Fabric items...")
    logger.info("Items will be retrieved in this order: Fabric items, Datamarts, Reports, Dashboards, SemanticModels, Apps, Dataflows")

    while True:
        page_count += 1

        if continuation_token:
            # The token is placed into the URL exactly as the API returned it.
            # Passing it via `params=` would let requests percent-encode it again.
            url = f"{CONFIG['API_BASE_URL']}{CONFIG['ITEMS_ENDPOINT']}?continuationToken={continuation_token}"

            if CONFIG['DEBUG_MODE']:
                logger.info(f"Page {page_count}: Making request with continuation token")

            response_data = _fetch_continuation_page(url, access_token, page_count)
        else:
            # First page - use the standard call_fabric_api function
            if CONFIG['DEBUG_MODE']:
                logger.info(f"Page {page_count}: Making initial request")

            try:
                response_data = call_fabric_api(CONFIG['ITEMS_ENDPOINT'], access_token)
            except requests.exceptions.RequestException as e:
                logger.error(f"API call failed on page {page_count}: {str(e)}")
                raise

        if response_data is None:
            # Defensive: a fetch that produced no payload must stop the run
            # rather than crash later on `.keys()`
            message = f"No response data received for page {page_count}"
            logger.error(message)
            raise requests.exceptions.RetryError(message)

        if CONFIG['DEBUG_MODE']:
            logger.info(f"Response keys: {list(response_data.keys())}")

        # The API returns items in the "itemEntities" array; treat null as empty
        items = response_data.get("itemEntities") or []

        if items:
            all_items.extend(items)
            logger.info(f"Retrieved {len(items)} items on page {page_count}. Running total: {len(all_items)}")

            if CONFIG['DEBUG_MODE']:
                # Sample item and per-page type distribution
                logger.info(f"Sample item: {json.dumps(items[0], indent=2)}")
                logger.info(f"Page {page_count} item types: {_count_item_types(items)}")
        else:
            logger.warning(f"No items found on page {page_count}")

        # Check if there are more pages
        continuation_token = response_data.get("continuationToken")
        continuation_uri = response_data.get("continuationUri")

        if not continuation_token:
            logger.info("No continuation token found - this is the last page")
            break

        logger.info("Found continuation token, will retrieve next page")
        if CONFIG['DEBUG_MODE']:
            logger.info(f"Continuation URI: {continuation_uri}")

    logger.info(f"Finished retrieving all items. Total pages: {page_count}, Total items: {len(all_items)}")

    # Log overall item type distribution
    if all_items:
        logger.info("Final item type distribution:")
        for item_type, count in sorted(_count_item_types(all_items).items()):
            logger.info(f"  {item_type}: {count}")

    return all_items
# ==================================


# CELL 8 - Data Processing Functions
# ==================================
def _build_creator_record(creator_principal: Dict) -> Dict:
    """Flatten one creatorPrincipal object into a creators-table record."""
    principal_type = creator_principal.get("type")
    record = {
        "principal_id": creator_principal.get("id"),
        "display_name": creator_principal.get("displayName"),
        "principal_type": principal_type,
        "user_principal_name": None,
        "aad_app_id": None,
        "group_type": None,
        "parent_principal_id": None
    }

    # Type-specific details. `or {}` (rather than a .get default) also covers a
    # key that is present with an explicit null value.
    if principal_type == "User":
        user_details = creator_principal.get("userDetails") or {}
        record["user_principal_name"] = user_details.get("userPrincipalName")
    elif principal_type == "ServicePrincipal":
        service_details = creator_principal.get("servicePrincipalDetails") or {}
        record["aad_app_id"] = service_details.get("aadAppId")
    elif principal_type == "Group":
        group_details = creator_principal.get("groupDetails") or {}
        record["group_type"] = group_details.get("groupType")
    elif principal_type == "ServicePrincipalProfile":
        profile_details = creator_principal.get("servicePrincipalProfileDetails") or {}
        parent_principal = profile_details.get("parentPrincipal") or {}
        record["parent_principal_id"] = parent_principal.get("id")

    return record


def extract_items_and_creators(items_data: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
    """
    Split API items into item records and de-duplicated creator records.

    Each item is reduced to its core fields plus `creator_principal_id`; the nested
    creatorPrincipal object is stored once per principal ID (the first occurrence
    wins) so creator details are not repeated on every item.

    Detail objects (userDetails, servicePrincipalDetails, groupDetails,
    servicePrincipalProfileDetails, parentPrincipal) that are missing or null
    leave the corresponding creator field as None.

    Args:
        items_data: List of item dictionaries from the API

    Returns:
        tuple: (simplified_items, unique_creators)
            - simplified_items: List of item records without nested creator details
            - unique_creators: List of unique creator/principal records, in order
              of first appearance
    """
    simplified_items = []
    creators_dict = {}  # principal ID -> creator record; the first occurrence is kept

    logger.info(f"Processing {len(items_data)} items to extract normalized data...")

    for item in items_data:
        creator_principal = item.get("creatorPrincipal") or {}
        principal_id = creator_principal.get("id")

        simplified_items.append({
            "id": item.get("id"),
            "type": item.get("type"),
            "name": item.get("name"),
            "description": item.get("description"),
            "state": item.get("state"),
            "last_updated_date": item.get("lastUpdatedDate"),  # API uses this exact field name
            "workspace_id": item.get("workspaceId"),
            "capacity_id": item.get("capacityId"),
            "creator_principal_id": principal_id  # None when the item has no creator
        })

        # Only add the creator the first time its principal ID is seen
        if principal_id and principal_id not in creators_dict:
            creators_dict[principal_id] = _build_creator_record(creator_principal)

    unique_creators = list(creators_dict.values())

    logger.info(f"Extracted {len(simplified_items)} items and {len(unique_creators)} unique creators")

    # Log creator type distribution
    if unique_creators:
        creator_types = {}
        for creator in unique_creators:
            # 'principal_type' is always present but may be None; map that to
            # 'Unknown' so the keys stay sortable strings
            creator_type = creator.get('principal_type') or 'Unknown'
            creator_types[creator_type] = creator_types.get(creator_type, 0) + 1

        logger.info("Creator type distribution:")
        for creator_type, count in sorted(creator_types.items()):
            logger.info(f"  {creator_type}: {count}")

    return simplified_items, unique_creators


def create_items_dataframe(items_data: List[Dict]) -> "DataFrame":
    """
    Convert the simplified item records into a PySpark DataFrame for Delta Lake.

    The returned DataFrame has the same schema whether or not there is data:
    string columns for the item attributes, `last_updated_date` as a timestamp,
    and a non-null `extraction_timestamp`. (Returning a string-typed
    `last_updated_date` only for empty input would make the schema depend on
    the data.)

    Processing steps for non-empty input:
    - Every record is reduced to the expected columns; missing keys and empty
      strings become None.
    - The rows are loaded with an explicit all-string schema so that Spark never
      has to infer a type from an all-None column.
    - `last_updated_date` is cast to a timestamp. Values that sort before
      "1900-01-01T00:00:00.000Z" as strings, and nulls, become null.

    Args:
        items_data: List of simplified item dictionaries

    Returns:
        DataFrame: A PySpark DataFrame ready for Delta Lake storage
    """
    column_names = ["id", "type", "name", "description", "state",
                    "last_updated_date", "workspace_id", "capacity_id", "creator_principal_id"]

    # Handle empty data
    if not items_data:
        logger.warning("No items found. Creating empty DataFrame.")
        final_schema = StructType(
            [
                StructField(
                    name,
                    TimestampType() if name == "last_updated_date" else StringType(),
                    name != "id"                                   # "id" is the primary key
                )
                for name in column_names
            ]
            + [StructField("extraction_timestamp", TimestampType(), False)]
        )
        return spark.createDataFrame([], final_schema)

    # Single definition of the load schema: everything arrives as a string
    load_schema = StructType([
        StructField(name, StringType(), name != "id") for name in column_names
    ])

    # Normalize in plain Python: guarantees every column exists and turns '' into
    # None without relying on pandas' version-dependent `replace('', None)`
    normalized_records = [
        {name: (None if record.get(name) == "" else record.get(name)) for name in column_names}
        for record in items_data
    ]
    pandas_df = pd.DataFrame(normalized_records, columns=column_names)

    spark_df = spark.createDataFrame(pandas_df, schema=load_schema)

    # A null input makes the comparison null, so it falls through to otherwise(None);
    # no separate isNotNull() guard is needed
    parsed_last_updated = when(
        col("last_updated_date") >= "1900-01-01T00:00:00.000Z",
        col("last_updated_date").cast(TimestampType())
    ).otherwise(None)

    return (
        spark_df
        .withColumn("extraction_timestamp", current_timestamp())
        .withColumn("last_updated_date", parsed_last_updated)
    )


def create_creators_dataframe(creators_data: List[Dict]) -> "DataFrame":
    """
    Convert the creator/principal records into a PySpark DataFrame for Delta Lake.

    All principal attributes are stored as strings; the type-specific columns
    (user_principal_name, aad_app_id, group_type, parent_principal_id) are null
    for records that do not carry them. A non-null `extraction_timestamp` column
    is appended. Missing keys and empty strings in the input become null.

    Args:
        creators_data: List of creator/principal dictionaries

    Returns:
        DataFrame: A PySpark DataFrame ready for Delta Lake storage
    """
    column_names = ["principal_id", "display_name", "principal_type",
                    "user_principal_name", "aad_app_id", "group_type", "parent_principal_id"]

    # Single definition of the attribute fields; "principal_id" is the primary key
    attribute_fields = [
        StructField(name, StringType(), name != "principal_id") for name in column_names
    ]

    # Handle empty data
    if not creators_data:
        logger.warning("No creators found. Creating empty DataFrame.")
        creators_schema = StructType(
            attribute_fields + [StructField("extraction_timestamp", TimestampType(), False)]
        )
        return spark.createDataFrame([], creators_schema)

    # Normalize in plain Python: guarantees every column exists and turns '' into
    # None without relying on pandas' version-dependent `replace('', None)`
    normalized_records = [
        {name: (None if record.get(name) == "" else record.get(name)) for name in column_names}
        for record in creators_data
    ]
    pandas_df = pd.DataFrame(normalized_records, columns=column_names)

    # Explicit schema so an all-None column is loaded as a string column
    spark_df = spark.createDataFrame(pandas_df, schema=StructType(attribute_fields))

    return spark_df.withColumn("extraction_timestamp", current_timestamp())
# ==================================


# CELL 10 - Delta Lake Operations and Schema Fix Functions
# ==================================
def _write_empty_delta_table(table_name: str, df_schema) -> None:
    """Write an empty table named `table_name` with `df_schema`, overwriting data and schema."""
    empty_df = spark.createDataFrame([], df_schema)
    empty_df.write \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable(table_name)


def _recreate_delta_table(table_name: str, df_schema) -> None:
    """Drop `table_name` and create it again, empty, with `df_schema`."""
    spark.sql(f"DROP TABLE IF EXISTS {table_name}")
    logger.info(f"Dropped table '{table_name}' with invalid schema")
    _write_empty_delta_table(table_name, df_schema)
    logger.info(f"Recreated Delta table '{table_name}' with correct schema")


def ensure_delta_table_exists(table_name: str, df_schema):
    """
    Ensure the Delta table exists with the expected schema.

    - If the table cannot be read (for example because it does not exist), an
      empty table with `df_schema` is created.
    - If the existing table contains VOID/NullType columns, or a column of
      `df_schema` is missing or has a different data type, the table is dropped
      and recreated empty. WARNING: this discards the rows currently in the table.
    - Otherwise the table is left untouched.

    Only reading the existing schema is wrapped in the fallback handler; an
    error while dropping or recreating the table propagates to the caller.

    Args:
        table_name: Name of the Delta table
        df_schema: Schema (StructType) the table is expected to have
    """
    try:
        existing_table_schema = spark.table(table_name).schema
    except Exception as e:
        # Table doesn't exist or cannot be read: create it
        logger.info(f"Creating Delta table '{table_name}' (reason: {str(e)})")
        _write_empty_delta_table(table_name, df_schema)
        logger.info(f"Delta table '{table_name}' created successfully")
        return

    logger.info(f"Delta table '{table_name}' already exists")

    # Detect VOID/NullType columns by type name. Comparing str(dataType) with
    # 'NullType' does not work on PySpark versions whose repr is 'NullType()'.
    # typeName() is "void" on current versions and "null" on older ones.
    void_columns = [
        field.name
        for field in existing_table_schema.fields
        if field.dataType.typeName() in ("void", "null")
    ]
    for column_name in void_columns:
        logger.warning(f"Found VOID/NullType column '{column_name}' in existing table")

    if void_columns:
        logger.warning(f"Table '{table_name}' has VOID/NullType columns. Recreating with correct schema...")
        _recreate_delta_table(table_name, df_schema)
        return

    # Every expected column must exist with the same data type; extra columns
    # in the existing table are tolerated
    existing_fields = {field.name: field.dataType for field in existing_table_schema.fields}

    schema_matches = True
    for field in df_schema.fields:
        if field.name not in existing_fields:
            logger.warning(f"Missing column '{field.name}' in existing table")
            schema_matches = False
        elif str(existing_fields[field.name]) != str(field.dataType):
            logger.warning(f"Schema mismatch for column '{field.name}': expected {field.dataType}, found {existing_fields[field.name]}")
            schema_matches = False

    if schema_matches:
        logger.info(f"Table '{table_name}' schema is compatible")
    else:
        logger.warning(f"Schema mismatch detected. Recreating table '{table_name}' with correct schema...")
        _recreate_delta_table(table_name, df_schema)


def fix_void_columns_in_existing_tables():
    """
    Rebuild configured tables whose schema contains VOID (NullType) columns.

    The tables named by CONFIG["ITEMS_TABLE_NAME"] and CONFIG["CREATORS_TABLE_NAME"]
    are inspected in turn. When a table has at least one NullType column, its rows
    are read back, re-shaped to the expected schema for that table, and written
    over the table with ``overwriteSchema`` so every column has a concrete type.

    Returns:
        None. Progress and problems are reported through ``logger``.

    Raises:
        Nothing. A failure on one table is logged and the next table is processed.
    """
    # Function-scope import: NullType is not among the notebook's top-level
    # pyspark.sql.types imports.
    from pyspark.sql.types import NullType

    logger.info("=" * 60)
    logger.info("FIXING VOID COLUMNS FOR SQL ANALYTICS ENDPOINT COMPATIBILITY")
    logger.info("=" * 60)

    tables_to_fix = [
        CONFIG["ITEMS_TABLE_NAME"],
        CONFIG["CREATORS_TABLE_NAME"]
    ]

    for table_name in tables_to_fix:
        try:
            # Raises when the table does not exist; handled by the except below.
            existing_df = spark.table(table_name)
            logger.info(f"Checking table '{table_name}' for VOID columns...")

            # Detect by type, not by str(): the text form of a DataType differs
            # between PySpark releases ('NullType' vs 'NullType()'), so a string
            # comparison can silently never match.
            has_void_columns = False
            for field in existing_df.schema.fields:
                if isinstance(field.dataType, NullType):
                    logger.warning(f"Found VOID/NullType column: {field.name} ({field.dataType})")
                    has_void_columns = True

            if not has_void_columns:
                logger.info(f"Table '{table_name}' schema is already correct - no VOID columns found")
                continue

            logger.info(f"Fixing VOID columns in table '{table_name}'...")

            # Materialize the rows on the driver before the table is rewritten.
            existing_data = existing_df.collect()
            row_count = len(existing_data)
            logger.info(f"Backing up {row_count} rows from '{table_name}'")

            # Expected schema for the table being repaired.
            if table_name == CONFIG["CREATORS_TABLE_NAME"]:
                correct_schema = StructType([
                    StructField("principal_id", StringType(), False),
                    StructField("display_name", StringType(), True),
                    StructField("principal_type", StringType(), True),
                    StructField("user_principal_name", StringType(), True),
                    StructField("aad_app_id", StringType(), True),
                    StructField("group_type", StringType(), True),
                    StructField("parent_principal_id", StringType(), True),
                    StructField("extraction_timestamp", TimestampType(), False)
                ])
            else:  # items table
                correct_schema = StructType([
                    StructField("id", StringType(), False),
                    StructField("type", StringType(), True),
                    StructField("name", StringType(), True),
                    StructField("description", StringType(), True),
                    StructField("state", StringType(), True),
                    StructField("last_updated_date", TimestampType(), True),
                    StructField("workspace_id", StringType(), True),
                    StructField("capacity_id", StringType(), True),
                    StructField("creator_principal_id", StringType(), True),
                    StructField("extraction_timestamp", TimestampType(), False)
                ])

            # Build each row as a tuple in schema order. Going through pandas
            # would map columns by position (wrong if the stored column order
            # differs) and would turn missing timestamps into NaT.
            fixed_rows = []
            for row in existing_data:
                row_dict = row.asDict()
                fixed_row = []
                for field in correct_schema.fields:
                    value = row_dict.get(field.name)  # absent column -> None
                    if value == '' or value == 'null':
                        # Blank / literal 'null' strings are stored as real NULLs.
                        value = None
                    fixed_row.append(value)
                fixed_rows.append(tuple(fixed_row))

            # An empty list yields an empty DataFrame carrying the correct schema.
            new_df = spark.createDataFrame(fixed_rows, schema=correct_schema)

            # Replace data and schema in one overwrite instead of DROP + create:
            # if the write fails, the existing table is still there.
            new_df.write \
                .mode("overwrite") \
                .option("overwriteSchema", "true") \
                .saveAsTable(table_name)

            logger.info(f"Recreated table '{table_name}' with proper schema")
            logger.info(f"Restored {row_count} rows to '{table_name}'")

            # Verify the fix
            fixed_df = spark.table(table_name)
            logger.info(f"Verification - New schema for '{table_name}':")
            fixed_df.printSchema()

        except Exception as e:
            logger.error(f"Failed to check/fix table '{table_name}': {str(e)}")
            # Continue with other tables
            continue

    logger.info("VOID column fix process completed!")
    logger.info("Tables should now be compatible with SQL Analytics Endpoint")


def merge_items_to_delta(source_df, table_name: str):
    """
    Upsert items into the target Delta table, keyed on ``id``.

    Rows with a null ``id`` are dropped first. If the target table is empty the
    remaining rows are appended; otherwise a MERGE updates rows whose ``id``
    matches and inserts the rest. If the MERGE statement fails, the table is
    overwritten with the cleaned source rows as a fallback.

    Args:
        source_df: DataFrame with new items data
        table_name: Name of the target Delta table

    Raises:
        ValueError: the target table has a VOID/NullType column.
        Exception: the fallback overwrite failed after a failed MERGE.
    """
    # Function-scope import: NullType is not among the notebook's top-level
    # pyspark.sql.types imports.
    from pyspark.sql.types import NullType

    logger.info(f"Starting merge operation for {table_name}")

    # Filter out any records with null IDs
    # Note: last_updated_date filtering is already handled in DataFrame creation
    cleaned_df = source_df.filter(col("id").isNotNull())

    row_count_before = source_df.count()
    row_count_after = cleaned_df.count()

    if row_count_before != row_count_after:
        logger.warning(f"Filtered out {row_count_before - row_count_after} records with null IDs")

    logger.info("Source DataFrame schema:")
    cleaned_df.printSchema()

    # Look the target up separately from validating it. A lookup failure is
    # tolerated (warn and continue), but a VOID column must abort the merge;
    # with both in one try/except the ValueError below would be swallowed.
    target_df = None
    try:
        target_df = spark.table(table_name)
        logger.info("Target table schema:")
        target_df.printSchema()
    except Exception as e:
        logger.warning(f"Cannot validate target table schema: {str(e)}. Proceeding with merge operation.")

    if target_df is not None:
        for field in target_df.schema.fields:
            # isinstance instead of str(): the text form of a DataType differs
            # between PySpark releases ('NullType' vs 'NullType()').
            if isinstance(field.dataType, NullType):
                logger.error(f"Target table has VOID/NullType column '{field.name}'. Cannot proceed with merge.")
                logger.error("Please recreate the target table with correct schema.")
                raise ValueError(f"Invalid target table schema - column '{field.name}' has type {field.dataType}")

    # Create a temporary view for the merge operation
    cleaned_df.createOrReplaceTempView("items_updates")

    # If the table is empty, just insert all records
    try:
        if spark.table(table_name).count() == 0:
            logger.info(f"Table {table_name} is empty. Inserting all records.")
            cleaned_df.write.mode("append").saveAsTable(table_name)
            return
    except Exception as e:
        logger.warning(f"Could not check table row count: {str(e)}. Proceeding with merge operation.")

    # Perform the merge operation
    merge_query = f"""
    MERGE INTO {table_name} AS target
    USING items_updates AS source
    ON target.id = source.id
    WHEN MATCHED THEN
        UPDATE SET 
            target.type = source.type,
            target.name = source.name,
            target.description = source.description,
            target.state = source.state,
            target.last_updated_date = source.last_updated_date,
            target.workspace_id = source.workspace_id,
            target.capacity_id = source.capacity_id,
            target.creator_principal_id = source.creator_principal_id,
            target.extraction_timestamp = source.extraction_timestamp
    WHEN NOT MATCHED THEN
        INSERT *
    """

    try:
        spark.sql(merge_query)
        logger.info("Items merge operation completed successfully")
    except Exception as e:
        logger.error(f"Merge operation failed: {str(e)}")
        logger.info("Attempting fallback: direct insert with overwrite mode")
        # NOTE(review): the fallback replaces the whole table with this batch,
        # so rows that are not present in source_df are no longer in the table.
        try:
            cleaned_df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(table_name)
            logger.info("Fallback insert completed successfully")
        except Exception as fallback_e:
            logger.error(f"Fallback insert also failed: {str(fallback_e)}")
            raise


def merge_creators_to_delta(source_df, table_name: str):
    """
    Upsert creators into the target Delta table, keyed on ``principal_id``.

    The target table must be readable and free of VOID/NullType columns. If it
    is empty the source rows are appended; otherwise a MERGE updates rows whose
    ``principal_id`` matches and inserts the rest. If the MERGE statement fails,
    the table is overwritten with the source rows as a fallback.

    Args:
        source_df: DataFrame with new creators data
        table_name: Name of the target Delta table

    Raises:
        ValueError: the target table has a VOID/NullType column.
        Exception: the target table cannot be read, or the fallback overwrite
            failed after a failed MERGE.
    """
    # Function-scope import: NullType is not among the notebook's top-level
    # pyspark.sql.types imports.
    from pyspark.sql.types import NullType

    logger.info(f"Starting merge operation for {table_name}")

    logger.info("Source DataFrame schema:")
    source_df.printSchema()

    # Validate target table schema; any problem here stops the merge.
    try:
        target_df = spark.table(table_name)
        logger.info("Target table schema:")
        target_df.printSchema()

        for field in target_df.schema.fields:
            # isinstance instead of str(): the text form of a DataType differs
            # between PySpark releases ('NullType' vs 'NullType()'), so a
            # string comparison can silently never match.
            if isinstance(field.dataType, NullType):
                logger.error(f"Target table has VOID/NullType column '{field.name}'. Cannot proceed with merge.")
                logger.error("Please recreate the target table with correct schema.")
                raise ValueError(f"Invalid target table schema - column '{field.name}' has type {field.dataType}")

    except Exception as e:
        logger.error(f"Cannot validate target table schema: {str(e)}")
        raise

    # Create a temporary view for the merge operation
    source_df.createOrReplaceTempView("creators_updates")

    # If the table is empty, just insert all records
    try:
        if spark.table(table_name).count() == 0:
            logger.info(f"Table {table_name} is empty. Inserting all records.")
            source_df.write.mode("append").saveAsTable(table_name)
            return
    except Exception as e:
        logger.warning(f"Could not check target table row count: {str(e)}. Proceeding with merge operation.")

    # Perform the merge operation
    merge_query = f"""
    MERGE INTO {table_name} AS target
    USING creators_updates AS source
    ON target.principal_id = source.principal_id
    WHEN MATCHED THEN
        UPDATE SET 
            target.display_name = source.display_name,
            target.principal_type = source.principal_type,
            target.user_principal_name = source.user_principal_name,
            target.aad_app_id = source.aad_app_id,
            target.group_type = source.group_type,
            target.parent_principal_id = source.parent_principal_id,
            target.extraction_timestamp = source.extraction_timestamp
    WHEN NOT MATCHED THEN
        INSERT *
    """

    try:
        spark.sql(merge_query)
        logger.info("Creators merge operation completed successfully")
    except Exception as e:
        logger.error(f"Merge operation failed: {str(e)}")
        logger.info("Attempting fallback: direct insert with overwrite mode")
        # NOTE(review): the fallback replaces the whole table with this batch,
        # so rows that are not present in source_df are no longer in the table.
        try:
            source_df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(table_name)
            logger.info("Fallback insert completed successfully")
        except Exception as fallback_e:
            logger.error(f"Fallback insert also failed: {str(fallback_e)}")
            raise


def optimize_delta_table(table_name: str):
    """
    Refresh query-planner statistics for a Delta table (best effort).

    Runs ``ANALYZE TABLE ... COMPUTE STATISTICS`` and then checks whether the
    table can be resolved through the Delta API. No OPTIMIZE or ZORDER command
    is issued. Every failure is logged and swallowed, because this step is not
    required for the load itself to be correct.

    Args:
        table_name: Name of the Delta table to optimize
    """
    logger.info(f"Optimizing Delta table '{table_name}'")

    # Stage 1: recompute table statistics. On failure, log and stop here.
    try:
        spark.sql(f"ANALYZE TABLE {table_name} COMPUTE STATISTICS")
        logger.info("Table statistics updated successfully")
    except Exception as stats_error:
        logger.warning(f"Table optimization step encountered an issue: {str(stats_error)}")
        logger.info("Continuing with process - optimization is not critical for functionality")
        return

    # Stage 2: resolve the table through the Delta API. The handle is not used;
    # the lookup only decides which informational messages are logged.
    try:
        DeltaTable.forName(spark, table_name)
        logger.info("Delta table optimization completed via statistics computation")
        logger.info("Note: Microsoft Fabric may automatically optimize Delta tables")
    except Exception as lookup_error:
        logger.info(f"Delta table reference not available: {str(lookup_error)}")
        logger.info("Continuing with standard statistics optimization")
# ==================================


# CELL 10 - Main Execution Function
# ==================================
def _build_empty_frames():
    """Return empty ``(items_df, creators_df)`` DataFrames that carry the table schemas."""
    items_schema = StructType([
        StructField("id", StringType(), False),
        StructField("type", StringType(), True),
        StructField("name", StringType(), True),
        StructField("description", StringType(), True),
        StructField("state", StringType(), True),
        StructField("last_updated_date", TimestampType(), True),
        StructField("workspace_id", StringType(), True),
        StructField("capacity_id", StringType(), True),
        StructField("creator_principal_id", StringType(), True),
        StructField("extraction_timestamp", TimestampType(), False)
    ])

    creators_schema = StructType([
        StructField("principal_id", StringType(), False),
        StructField("display_name", StringType(), True),
        StructField("principal_type", StringType(), True),
        StructField("user_principal_name", StringType(), True),
        StructField("aad_app_id", StringType(), True),
        StructField("group_type", StringType(), True),
        StructField("parent_principal_id", StringType(), True),
        StructField("extraction_timestamp", TimestampType(), False)
    ])

    items_df = spark.createDataFrame([], items_schema)
    creators_df = spark.createDataFrame([], creators_schema)
    return items_df, creators_df


def _report_table_statistics(items_table_name: str, creators_table_name: str):
    """Show table details, row counts and summary queries; return ``(items_count, creators_count)``."""
    # Show table details
    logger.info("Items table details:")
    spark.sql(f"DESCRIBE DETAIL {items_table_name}").show(truncate=False)

    logger.info("Creators table details:")
    spark.sql(f"DESCRIBE DETAIL {creators_table_name}").show(truncate=False)

    # Show row counts
    items_count = spark.table(items_table_name).count()
    creators_count = spark.table(creators_table_name).count()

    logger.info(f"Total rows in {items_table_name}: {items_count}")
    logger.info(f"Total rows in {creators_table_name}: {creators_count}")

    # Item-level statistics are only meaningful when the items table has rows.
    if items_count > 0:
        logger.info("=" * 40)
        logger.info("ITEMS SUMMARY STATISTICS")
        logger.info("=" * 40)

        summary_stats = spark.sql(f"""
            SELECT
                COUNT(*) as total_items,
                COUNT(DISTINCT workspace_id) as unique_workspaces,
                COUNT(DISTINCT capacity_id) as unique_capacities,
                COUNT(DISTINCT type) as unique_item_types,
                COUNT(DISTINCT state) as unique_states,
                COUNT(DISTINCT creator_principal_id) as unique_creators,
                MAX(extraction_timestamp) as last_updated
            FROM {items_table_name}
        """)
        summary_stats.show(truncate=False)

        # Show item type distribution
        logger.info("Item Type Distribution:")
        type_distribution = spark.sql(f"""
            SELECT
                type,
                COUNT(*) as count,
                ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
            FROM {items_table_name}
            GROUP BY type
            ORDER BY count DESC
        """)
        type_distribution.show(20, truncate=False)

        # Show state distribution
        logger.info("Item State Distribution:")
        state_distribution = spark.sql(f"""
            SELECT
                state,
                COUNT(*) as count
            FROM {items_table_name}
            GROUP BY state
            ORDER BY count DESC
        """)
        state_distribution.show(truncate=False)

        # Show top workspaces by item count
        logger.info("Top 10 Workspaces by Item Count:")
        workspace_distribution = spark.sql(f"""
            SELECT
                workspace_id,
                COUNT(*) as item_count,
                COUNT(DISTINCT type) as unique_types
            FROM {items_table_name}
            WHERE workspace_id IS NOT NULL
            GROUP BY workspace_id
            ORDER BY item_count DESC
            LIMIT 10
        """)
        workspace_distribution.show(truncate=False)

    if creators_count > 0:
        logger.info("=" * 40)
        logger.info("CREATORS SUMMARY STATISTICS")
        logger.info("=" * 40)

        # Show creator type distribution
        creator_type_distribution = spark.sql(f"""
            SELECT
                principal_type,
                COUNT(*) as count,
                ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
            FROM {creators_table_name}
            GROUP BY principal_type
            ORDER BY count DESC
        """)
        creator_type_distribution.show(truncate=False)

        # Show top creators by item count (join with items table)
        logger.info("Top 10 Most Active Creators:")
        top_creators = spark.sql(f"""
            SELECT
                c.display_name,
                c.principal_type,
                c.user_principal_name,
                COUNT(i.id) as items_created
            FROM {creators_table_name} c
            JOIN {items_table_name} i ON c.principal_id = i.creator_principal_id
            GROUP BY c.principal_id, c.display_name, c.principal_type, c.user_principal_name
            ORDER BY items_created DESC
            LIMIT 10
        """)
        top_creators.show(truncate=False)

    # Show sample join query demonstrating the two-table relationship
    if items_count > 0 and creators_count > 0:
        logger.info("=" * 40)
        logger.info("SAMPLE JOIN QUERY - Items with Creator Details")
        logger.info("=" * 40)

        sample_join = spark.sql(f"""
            SELECT
                i.name as item_name,
                i.type as item_type,
                i.state,
                i.workspace_id,
                c.display_name as creator_name,
                c.principal_type as creator_type,
                c.user_principal_name,
                i.last_updated_date
            FROM {items_table_name} i
            LEFT JOIN {creators_table_name} c ON i.creator_principal_id = c.principal_id
            ORDER BY i.last_updated_date DESC
            LIMIT 10
        """)
        sample_join.show(truncate=False)

    return items_count, creators_count


def main():
    """
    Main execution function that orchestrates the entire process.

    Steps, in the order they run:
    1. Get the authentication token
    2. Retrieve all items from the Fabric List Items API
    3. Extract and normalize items and creators data
    4. Create the items and creators DataFrames (empty, schema-only frames
       when the API returned no items)
    5. Ensure both Delta tables exist with the DataFrame schemas
    6. Merge creators, then items, into the Delta tables (only when items
       were returned)
    7. Update table statistics (only when items were returned)
    8. Fix VOID columns in the existing tables
    9. Display table details, row counts and summary statistics

    Returns:
        Tuple ``(items_df, creators_df)`` holding the DataFrames built in this run.

    Raises:
        Exception: any error from the steps above is logged together with a
            troubleshooting checklist and then re-raised.
    """
    try:
        logger.info("Starting Fabric Items to Delta Lake process")
        logger.info("This process will create two normalized tables:")
        logger.info(f"  1. {CONFIG['ITEMS_TABLE_NAME']} - Core item information")
        logger.info(f"  2. {CONFIG['CREATORS_TABLE_NAME']} - Normalized creator/principal information")

        # Step 1: Get authentication token
        logger.info("Getting access token...")
        access_token = get_access_token()
        logger.info("Successfully obtained access token")

        # Step 2: Retrieve all items from API
        logger.info("Retrieving items from Fabric List Items API...")
        logger.info("Note: This API supports up to 10,000 items per request with pagination")
        items_data = get_all_items(access_token)

        if not items_data:
            logger.warning("No items found. Please check your permissions and API access.")
            logger.warning("Required permissions: Fabric administrator or service principal")
            logger.warning("Required scopes: Tenant.Read.All or Tenant.ReadWrite.All")

            # Empty frames with proper schemas keep the table structure consistent.
            items_df, creators_df = _build_empty_frames()
        else:
            # Step 3: Extract and normalize data
            logger.info(f"Extracting and normalizing data from {len(items_data)} items...")
            simplified_items, unique_creators = extract_items_and_creators(items_data)

            # Step 4: Create DataFrames
            logger.info("Creating optimized DataFrames for Delta Lake storage...")
            items_df = create_items_dataframe(simplified_items)
            creators_df = create_creators_dataframe(unique_creators)

        # Show sample data
        logger.info("Sample of items data:")
        items_df.show(5, truncate=False)

        logger.info("Sample of creators data:")
        creators_df.show(5, truncate=False)

        # Step 5: Prepare the Delta tables
        items_table_name = CONFIG["ITEMS_TABLE_NAME"]
        creators_table_name = CONFIG["CREATORS_TABLE_NAME"]

        logger.info("Setting up Delta tables...")
        ensure_delta_table_exists(items_table_name, items_df.schema)
        ensure_delta_table_exists(creators_table_name, creators_df.schema)

        # Step 6: Merge data into Delta tables (if we have data)
        if items_data:
            logger.info("Merging data into Delta tables...")

            # Merge creators first (since items reference creators)
            merge_creators_to_delta(creators_df, creators_table_name)

            # Then merge items
            merge_items_to_delta(items_df, items_table_name)

            # Step 7: Optimize the Delta tables
            logger.info("Optimizing Delta tables for query performance...")
            optimize_delta_table(items_table_name)
            optimize_delta_table(creators_table_name)

        # Step 8: Fix VOID columns for SQL Analytics Endpoint compatibility
        logger.info("Fixing any VOID column issues for SQL Analytics Endpoint compatibility...")
        fix_void_columns_in_existing_tables()

        # Step 9: Display comprehensive statistics and information
        logger.info("=" * 60)
        logger.info("FABRIC ITEMS ETL PROCESS COMPLETED SUCCESSFULLY!")
        logger.info("=" * 60)

        items_count, creators_count = _report_table_statistics(items_table_name, creators_table_name)

        logger.info("=" * 60)
        logger.info("PROCESS COMPLETED - Data is ready for analytics!")
        logger.info("=" * 60)
        logger.info("Tables created:")
        logger.info(f"  • {items_table_name} - {items_count} records")
        logger.info(f"  • {creators_table_name} - {creators_count} records")
        logger.info("Use these tables for Fabric governance, compliance, and analytics!")

        return items_df, creators_df

    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")
        logger.error("Please check:")
        logger.error("  1. You have Fabric administrator permissions")
        logger.error("  2. Required scopes: Tenant.Read.All or Tenant.ReadWrite.All")
        logger.error("  3. API rate limits (200 requests/hour)")
        logger.error("  4. Network connectivity to api.fabric.microsoft.com")
        raise
# ==================================


# CELL 11 - Execute Main Function
# ==================================
# Execute the main function
if __name__ == "__main__":
    # Run the pipeline; the returned frames stay bound at notebook scope.
    items_df, creators_df = main()

    # Tell the reader which variables are available for follow-up work.
    print(
        "\nDataFrames are available as:",
        "  - items_df: Main items DataFrame",
        "  - creators_df: Creators DataFrame",
        "\nYou can now perform additional analysis or create custom queries!",
        sep="\n",
    )
# ==================================


# CELL 12 - Optional: Additional Analysis Examples
# ==================================
# OPTIONAL CELL - additional analysis examples (disabled by default)
# This cell provides examples of how to use the data for common analytics scenarios.
# NOTE: the examples are wrapped in a triple-quoted string literal, not in '#'
# comments. Python evaluates the string as a bare expression and discards it, so
# none of the queries run. To run them, remove the enclosing triple quotes.
# The queries read the tables named by CONFIG["ITEMS_TABLE_NAME"] and
# CONFIG["CREATORS_TABLE_NAME"].

"""
# Example 1: Find all items created in the last 30 days
recent_items = spark.sql(f'''
    SELECT 
        i.name,
        i.type,
        c.display_name as creator,
        i.last_updated_date
    FROM {CONFIG["ITEMS_TABLE_NAME"]} i
    LEFT JOIN {CONFIG["CREATORS_TABLE_NAME"]} c ON i.creator_principal_id = c.principal_id
    WHERE i.last_updated_date >= date_sub(current_date(), 30)
    ORDER BY i.last_updated_date DESC
''')

print("Items created/updated in the last 30 days:")
recent_items.show(10, truncate=False)


# Example 2: Workspace utilization analysis
workspace_analysis = spark.sql(f'''
    SELECT 
        workspace_id,
        COUNT(*) as total_items,
        COUNT(DISTINCT type) as item_types,
        COUNT(DISTINCT creator_principal_id) as unique_creators,
        MAX(last_updated_date) as most_recent_activity
    FROM {CONFIG["ITEMS_TABLE_NAME"]}
    WHERE workspace_id IS NOT NULL
    GROUP BY workspace_id
    ORDER BY total_items DESC
''')

print("Workspace utilization analysis:")
workspace_analysis.show(20, truncate=False)


# Example 3: Creator productivity analysis
creator_productivity = spark.sql(f'''
    SELECT 
        c.display_name,
        c.principal_type,
        COUNT(i.id) as items_created,
        COUNT(DISTINCT i.type) as item_types_created,
        COUNT(DISTINCT i.workspace_id) as workspaces_contributed,
        MAX(i.last_updated_date) as most_recent_item
    FROM {CONFIG["CREATORS_TABLE_NAME"]} c
    JOIN {CONFIG["ITEMS_TABLE_NAME"]} i ON c.principal_id = i.creator_principal_id
    GROUP BY c.principal_id, c.display_name, c.principal_type
    HAVING items_created >= 5
    ORDER BY items_created DESC
''')

print("Most productive creators (5+ items):")
creator_productivity.show(15, truncate=False)


# Example 4: Item type distribution by workspace
type_by_workspace = spark.sql(f'''
    SELECT 
        workspace_id,
        type as item_type,
        COUNT(*) as count
    FROM {CONFIG["ITEMS_TABLE_NAME"]}
    WHERE workspace_id IS NOT NULL
    GROUP BY workspace_id, type
    ORDER BY workspace_id, count DESC
''')

print("Item type distribution by workspace:")
type_by_workspace.show(50, truncate=False)
"""

# These two messages are the only output this cell produces while the examples
# above remain inside the string literal.
print("Additional analysis examples are available in Cell 12")
print("Uncomment the code in Cell 12 to run example analytics queries")
# ==================================

StatementMeta(, f516eaea-ae7d-4600-9404-46789e43c376, 4, Finished, Available, Finished)

2025-07-16 15:52:18,813 - INFO - API Rate Limit: 200 requests/hour
2025-07-16 15:52:18,813 - INFO - Max records per request: 10000
2025-07-16 15:52:18,819 - INFO - Starting Fabric Items to Delta Lake process
2025-07-16 15:52:18,820 - INFO - This process will create two normalized tables:
2025-07-16 15:52:18,821 - INFO -   1. fabric_items - Core item information
2025-07-16 15:52:18,822 - INFO -   2. fabric_items_creators - Normalized creator/principal information
2025-07-16 15:52:18,822 - INFO - Getting access token...
2025-07-16 15:52:20,227 - INFO - Successfully obtained access token for Fabric API
2025-07-16 15:52:20,227 - INFO - Successfully obtained access token
2025-07-16 15:52:20,228 - INFO - Retrieving items from Fabric List Items API...
2025-07-16 15:52:20,228 - INFO - Note: This API supports up to 10,000 items per request with pagination
2025-07-16 15:52:20,229 - INFO - Starting to retrieve all Fabric items...
2025-07-16 15:52:20,229 - INFO - Items will be retrieved in this or

+------------------------------------+------------+-------------------------+------------+------+--------------------------+------------------------------------+------------------------------------+------------------------------------+--------------------------+
|id                                  |type        |name                     |description |state |last_updated_date         |workspace_id                        |capacity_id                         |creator_principal_id                |extraction_timestamp      |
+------------------------------------+------------+-------------------------+------------+------+--------------------------+------------------------------------+------------------------------------+------------------------------------+--------------------------+
|5c3b932b-b27d-456e-8a40-2b699c3952ab|Lakehouse   |LH_Bronze                |NULL        |Active|2024-09-12 13:53:13.382184|1a95e3bb-a11d-45bc-84d4-5c2c1f1aa990|c73a5223-9ef6-4514-83cc-3e70297ee377|d87d15b5-62cd

In [4]:
from pyspark.sql import SparkSession

# Obtain a Spark session handle; getOrCreate returns the active session when one exists.
spark = SparkSession.builder.appName("Refresh SQL Endpoint Metadata").getOrCreate()

# Issue REFRESH TABLE for each loaded table, confirming after every statement.
for refreshed_table in ("fabric_items", "fabric_items_creators"):
    spark.sql(f"REFRESH TABLE {refreshed_table}")
    print("Metadata refresh triggered successfully.")


StatementMeta(, f516eaea-ae7d-4600-9404-46789e43c376, 6, Finished, Available, Finished)

Metadata refresh triggered successfully.
Metadata refresh triggered successfully.
