In [None]:
# DO NOT DELETE THIS CELL

"""
This notebook scrapes the Fabric and Power BI REST API documentation and loads portions of it into the four tables listed below.

https://learn.microsoft.com/en-us/rest/api/fabric/articles/
https://learn.microsoft.com/en-us/rest/api/power-bi/

Loads table: api_endpoints
Loads table: api_documentation
Loads table: api_definitions
Loads table: api_definitions_properties

Use these tables to quickly:
    *filter and identify GET or POST API commands by using [api_documentation].[http_method] column
    *find related APIs by using keyword search on [api_endpoints].[operation_name] or [api_endpoints].[operation_desc]
    *query the API definitions to find relationships between them. Someone create an ERD!

# Known Issues:
1) Every value in the [api_documentation].[description] column is "This browser is no longer supported." Use [api_endpoints].[operation_desc] instead, which contains the correct values.
2) The [api_documentation].[endpoint_url] column is NULL for ALL Power BI APIs. It is only populated for Fabric API data. The data is located in a 
    different section of the page for Power BI APIs and code has not been added (yet) to scrape it.
3) The Power BI APIs include a VERY small number of definitions and/or property values with the word "error" in them. These were not scraped. Most, if not all, Fabric API 
    definitions share common "error" definitions, which I intentionally excluded. "error" is a keyword I'm not scraping.

"""

In [None]:
%%sql
--TRUNCATE TABLE api_endpoints;
--TRUNCATE TABLE api_documentation;
--TRUNCATE TABLE api_definitions;
--TRUNCATE TABLE api_definitions_properties;
--TRUNCATE TABLE api_definition_properties;

--delete from api_endpoints where endpoint_id >= '19';


In [1]:
# =============================================================================
# MICROSOFT FABRIC REST API DOCUMENTATION SCRAPER - ENHANCED WITH DEFINITIONS
# =============================================================================
# This notebook scrapes Microsoft Fabric REST API documentation and stores
# the results in Delta tables with proper relational design.
#
# What this does:
# 1. Discovers API endpoints from overview pages
# 2. Scrapes detailed documentation for each endpoint  
# 3. Extracts API definitions (objects and enums) from documentation
# 4. Stores everything in 4 properly normalized Delta tables
# =============================================================================

# =============================================================================
# BLOCK 1: IMPORTS AND SETUP
# =============================================================================
import requests
import time
import json
import re
import logging
import pytz
from datetime import datetime
from typing import List, Dict, Optional, Tuple
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Configure root logging at INFO level with a "timestamp - level - message" format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger. NOTE(review): the functions visible in this cell report
# progress with print() rather than through this logger.
logger = logging.getLogger(__name__)

# Time zone used by get_local_timestamp(): US Central Time
# (IANA zone 'America/Chicago', e.g. Houston, Texas).
LOCAL_TIMEZONE = pytz.timezone('America/Chicago')

def get_local_timestamp():
    """Return the current wall-clock time in LOCAL_TIMEZONE as a naive datetime.

    The current instant is obtained as an aware datetime in ``LOCAL_TIMEZONE``
    and its tzinfo is then dropped, so the result holds local wall-clock
    fields with ``tzinfo=None``.

    Returns:
        datetime: naive datetime carrying the current local time.
    """
    # datetime.utcnow() is deprecated since Python 3.12. Asking for "now" directly
    # in the target zone gives the same instant without a naive-UTC intermediate.
    return datetime.now(LOCAL_TIMEZONE).replace(tzinfo=None)

print("✅ All libraries imported successfully!")

# =============================================================================
# BLOCK 2: CONFIGURATION - SUPPLY YOUR OVERVIEW URLS HERE
# =============================================================================
# 🎯 THIS IS WHERE YOU SUPPLY YOUR OVERVIEW URLS!

OVERVIEW_URLS = [
    # One overview page per API group. Uncomment the entries to scrape in this run;
    # every entry, including the last, must end with a comma.
    # --- Fabric REST API overview pages ---
    #"https://learn.microsoft.com/en-us/rest/api/fabric/admin/domains",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/admin/workspaces",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/admin/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/admin/users",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/core/connections",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/report/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/admin/external-data-shares-provider",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/admin/labels",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/admin/sharing-links",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/admin/tags",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/admin/tenants",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/apacheairflowjob/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/copyjob/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/core/capacities",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/core/deployment-pipelines",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/core/external-data-shares-provider",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/core/external-data-shares-recipient",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/core/folders",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/core/gateways",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/core/git",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/core/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/core/job-scheduler",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/core/long-running-operations",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/core/managed-private-endpoints",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/core/onelake-data-access-security",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/core/onelake-shortcuts",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/core/tags",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/core/workspaces",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/dashboard/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/dataflow/background-jobs",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/dataflow/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/datapipeline/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/digitaltwinbuilder/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/digitaltwinbuilderflow/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/environment/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/environment/spark-compute",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/environment/spark-libraries",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/eventhouse/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/eventstream/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/eventstream/topology",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/graphqlapi/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/kqldatabase/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/kqlqueryset/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/lakehouse/background-jobs",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/lakehouse/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/lakehouse/livy-sessions",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/lakehouse/tables",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/mirroredazuredatabrickscatalog/discovery",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/mirroredazuredatabrickscatalog/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/mirroredazuredatabrickscatalog/refresh",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/mirroreddatabase/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/mirroreddatabase/mirroring",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/mirroredwarehouse/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/mlexperiment/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/mlmodel/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/mounteddatafactory/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/notebook/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/notebook/livy-sessions",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/paginatedreport/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/reflex/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/semanticmodel/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/spark/custom-pools",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/spark/livy-sessions",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/spark/workspace-settings",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/sparkjobdefinition/background-jobs",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/sparkjobdefinition/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/sparkjobdefinition/livy-sessions",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/sqldatabase/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/sqlendpoint/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/variablelibrary/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/warehouse/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/warehousesnapshot/items",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/workload/workloadapi/endpoint-resolution",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/workload/workloadapi/item-lifecycle",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/workload/workloadapi/jobs",
    #"https://learn.microsoft.com/en-us/rest/api/fabric/workload/workloadcontrolapi/workload-control",
    # --- Power BI REST API overview pages ---
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Admin",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Apps",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Available-Features",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Capacities",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Dashboards",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Dataflow-Storage-Accounts",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Dataflows",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Datasets",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Embed-Token",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Gateways",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Groups",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Imports",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Pipelines",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Profiles",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Push-Datasets",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Reports",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Template-Apps",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Users",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Scorecards(Preview)",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Goal-Notes(Preview)",
    #"https://learn.microsoft.com/en-us/rest/api/power-bi/Goal-Values(Preview)",
    "https://learn.microsoft.com/en-us/rest/api/power-bi/Goals(Preview)",
    "https://learn.microsoft.com/en-us/rest/api/power-bi/Goals-Status-Rules(Preview)",

]

# Echo the active configuration so the run log shows exactly which pages were requested.
print(f"📋 Configured {len(OVERVIEW_URLS)} overview URLs to process")
for i, url in enumerate(OVERVIEW_URLS, 1):
    print(f"   {i}. {url}")

# =============================================================================
# BLOCK 3: HELPER FUNCTIONS
# =============================================================================

def make_safe_request(session, url: str, max_retries: int = 3) -> Optional["requests.Response"]:
    """Fetch ``url`` through ``session.get`` and retry transient failures.

    Args:
        session: Object exposing ``get(url, timeout=...)`` (a ``requests.Session``
            in this notebook).
        url: Address to fetch.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The response of the first attempt answered with HTTP 200, or ``None``
        when every attempt fails or the server answers with a client error
        that a retry cannot fix.
    """
    for attempt in range(max_retries):
        # Sleeping after the final attempt would only delay the None result.
        has_attempts_left = attempt < max_retries - 1
        try:
            print(f"   📡 Fetching: {url} (attempt {attempt + 1})")
            response = session.get(url, timeout=30)
            status = response.status_code

            if status == 200:
                print(f"   ✅ Success!")
                return response

            if status == 429:
                if has_attempts_left:
                    # Rate limited: exponential backoff of 3s, 6s, 12s, ...
                    wait_time = 2 ** attempt * 3
                    print(f"   ⏳ Rate limited. Waiting {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    print(f"   ❌ HTTP {status}")
                continue

            print(f"   ❌ HTTP {status}")
            # Client errors such as 404 give the same answer on every attempt,
            # so stop immediately. 408 (request timeout) stays retryable.
            if 400 <= status < 500 and status != 408:
                return None
            # Server-side or other unexpected status: back off before retrying.
            if has_attempts_left:
                time.sleep(2 ** attempt)

        except Exception as e:
            # Deliberately best-effort: any failure of the request counts as a
            # failed attempt rather than aborting the whole scrape.
            print(f"   ❌ Error: {e}")
            if has_attempts_left:
                time.sleep(2 ** attempt)

    return None

def clean_description_text(description: str) -> str:
    """Strip preview notices and leading note labels from a scraped description.

    Only boilerplate is removed:
      * the sentence "This API is in preview." (any case), including a "Note"
        label fused in front of it;
      * a leading "Note" or "Preview" label that is followed by a colon, is
        fused to a capitalised word ("NoteReturns ..."), or is the whole text.

    Words that merely contain those letters ("notebook", "a preview of") are
    left untouched.

    Args:
        description: Raw text taken from the page; may be empty or None.

    Returns:
        The cleaned, whitespace-normalised description, or "" when the input
        is empty or fewer than 10 characters remain after cleaning.
    """
    if not description:
        return ""

    # Remove the preview notice wherever it appears. A space is substituted so the
    # neighbouring sentences are not glued together.
    cleaned = re.sub(
        r'(?:\bNote\s*)?This API is in preview\.?',
        ' ',
        description,
        flags=re.IGNORECASE,
    )

    # Remove a leading label only. This match is case-sensitive so that the
    # lookahead for a capital letter distinguishes "NoteReturns" from "Notebook".
    cleaned = re.sub(r'^\s*(?:Note|Preview)(?:\s*:\s*|(?=[A-Z])|\s*$)', '', cleaned)

    # Clean up extra whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Anything shorter than 10 characters is treated as no description at all
    if len(cleaned) < 10:
        return ""

    return cleaned

def extract_api_operations_from_page(html_content: str, overview_url: str) -> List[Dict]:
    """Extract individual API operations from an overview page.

    Every table row with at least two ``<td>`` cells is inspected: the first
    cell must hold a relative link to the operation page, the second cell
    supplies the description (first ``<p>`` after NOTE divs are removed).
    When no row qualifies, ``extract_operations_fallback_method`` is tried.

    Args:
        html_content: Raw HTML of the overview page.
        overview_url: URL the HTML was fetched from. It is used to resolve
            relative links and to derive the category and service names.

    Returns:
        One dict per operation with the keys ``category``, ``overview_url``,
        ``service``, ``operation_name``, ``operation_desc``, ``operation_url``
        and ``discovered_timestamp``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    operations = []

    # Category and service depend only on the overview URL, so derive them once
    # instead of once per row. Trailing slashes are dropped first so that
    # ".../admin/domains/" still yields service "domains" rather than "".
    url_parts = overview_url.rstrip('/').split('/')
    category = url_parts[-2] if len(url_parts) >= 2 else "unknown"
    service = url_parts[-1] or "unknown"

    # Look for the operations table
    for table in soup.find_all('table'):
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < 2:
                # Header rows (<th>) and malformed rows carry no operation
                continue

            # First cell should contain the operation link (relative hrefs only)
            operation_link = cells[0].find('a', href=lambda x: x and not x.startswith('http'))
            if not operation_link:
                continue

            href = operation_link.get('href', '')
            operation_name = operation_link.get_text(strip=True)

            # Filter for actual API operation links
            if not (href and '/' in href and not href.startswith('#') and
                    'articles' not in href and 'quickstart' not in href):
                continue

            if not operation_name:
                continue

            # Second cell contains description - extract from <p> tag, ignore NOTE divs
            desc_cell = cells[1]

            # Remove all NOTE divs from the cell before extracting description
            for note_div in desc_cell.find_all('div', class_='NOTE'):
                note_div.decompose()

            # Now find the <p> tag with the actual description
            description = ""
            p_tag = desc_cell.find('p')
            if p_tag:
                description = clean_description_text(p_tag.get_text(strip=True))

            operations.append({
                'category': category,
                'overview_url': overview_url,
                'service': service,
                'operation_name': operation_name,
                'operation_desc': description,
                # Build the full URL for this operation
                'operation_url': urljoin(overview_url, href),
                'discovered_timestamp': get_local_timestamp()
            })

    # Fallback: if no table found, try alternative method
    if not operations:
        print("   ⚠️ No table found, trying alternative extraction method...")
        operations = extract_operations_fallback_method(soup, overview_url)

    return operations

def extract_operations_fallback_method(soup: BeautifulSoup, overview_url: str) -> List[Dict]:
    """Fallback method for pages that don't use the standard table format.

    Every relative link on the page that passes the operation-link filter
    becomes an operation. A description is taken from the link's enclosing
    table row when there is one, otherwise it is left empty.

    Note: NOTE divs inside inspected rows are removed from ``soup`` in place.

    Args:
        soup: Parsed overview page.
        overview_url: URL the page was fetched from. It is used to resolve
            relative links and to derive the category and service names.

    Returns:
        One dict per matching link, with the same keys that
        ``extract_api_operations_from_page`` produces.
    """
    operations = []

    # Category and service depend only on the overview URL, so derive them once
    # instead of once per link. Trailing slashes are dropped first so that
    # ".../admin/domains/" still yields service "domains" rather than "".
    url_parts = overview_url.rstrip('/').split('/')
    category = url_parts[-2] if len(url_parts) >= 2 else "unknown"
    service = url_parts[-1] or "unknown"

    # Find all links that point to API operations (relative hrefs only)
    operation_links = soup.find_all('a', href=lambda x: x and not x.startswith('http'))

    for link in operation_links:
        href = link.get('href', '')

        # Filter for actual API operation links
        if not (href and '/' in href and not href.startswith('#') and
                'articles' not in href and 'quickstart' not in href):
            continue

        operation_name = link.get_text(strip=True)
        if not operation_name:
            continue

        # Try to find description in nearby content
        description = ""

        # Look for description in parent row or container
        parent_row = link.find_parent('tr')
        if parent_row:
            for cell in parent_row.find_all('td'):
                # Remove NOTE divs
                for note_div in cell.find_all('div', class_='NOTE'):
                    note_div.decompose()

                # The first <p> longer than 10 characters is taken as the description
                p_tag = cell.find('p')
                if p_tag:
                    desc_text = p_tag.get_text(strip=True)
                    if desc_text and len(desc_text) > 10:
                        description = clean_description_text(desc_text)
                        break

        operations.append({
            'category': category,
            'overview_url': overview_url,
            'service': service,
            'operation_name': operation_name,
            'operation_desc': description,
            # Build the full URL for this operation
            'operation_url': urljoin(overview_url, href),
            'discovered_timestamp': get_local_timestamp()
        })

    return operations

print("✅ Helper functions defined!")

# =============================================================================
# BLOCK 4: ENHANCED DELTA TABLE SETUP WITH PROPER SCHEMA - APPEND MODE
# =============================================================================

# Schema of the api_definitions_properties table, declared at module level for reuse.
# Each entry below is (column name, Spark type, nullable).
PROPERTIES_SCHEMA = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ("property_id", LongType(), False),            # Primary Key
        ("definition_id", LongType(), True),           # Foreign Key
        ("property_name", StringType(), True),
        ("property_type", StringType(), True),
        ("property_description", StringType(), True),
        ("is_required", BooleanType(), True),
        ("is_enum_value", BooleanType(), True),
        ("property_order", LongType(), True),
        ("discovered_timestamp", TimestampType(), True),
    )
])

def table_exists(table_name: str) -> bool:
    """Report whether ``DESCRIBE <table_name>`` succeeds in the active Spark session.

    Any exception raised by the query is interpreted as "table not present".
    """
    found = True
    try:
        spark.sql(f"DESCRIBE {table_name}")
    except Exception:
        found = False
    return found

def initialize_enhanced_delta_tables():
    """Create whichever of the four Delta tables are missing, then print all schemas.

    Tables handled, in order: api_endpoints, api_documentation, api_definitions
    and api_definitions_properties. Tables that already exist are left as they
    are; missing ones are created empty from the schemas declared below.
    """
    print("🗃️ Setting up enhanced Delta tables with proper relationships...")

    def _primary_key(name):
        # Key columns are the only non-nullable fields in these schemas
        return StructField(name, LongType(), False)

    def _column(name, dtype):
        return StructField(name, dtype, True)

    # Insertion order matters: it is the creation order and the display order.
    schemas_by_table = {
        # api_endpoints (replaces api_discovery)
        "api_endpoints": StructType([
            _primary_key("endpoint_id"),
            _column("category", StringType()),
            _column("overview_url", StringType()),
            _column("service", StringType()),
            _column("operation_name", StringType()),
            _column("operation_desc", StringType()),
            _column("operation_url", StringType()),
            _column("discovered_timestamp", TimestampType()),
        ]),
        # api_documentation (enhanced, no parameters)
        "api_documentation": StructType([
            _primary_key("doc_id"),
            _column("endpoint_id", LongType()),  # Foreign Key
            _column("source_url", StringType()),
            _column("api_category", StringType()),
            _column("operation_name", StringType()),
            _column("description", StringType()),
            _column("http_method", StringType()),
            _column("endpoint_url", StringType()),
            _column("discovered_timestamp", TimestampType()),
        ]),
        # api_definitions (new)
        "api_definitions": StructType([
            _primary_key("definition_id"),
            _column("doc_id", LongType()),  # Foreign Key
            _column("definition_name", StringType()),
            _column("definition_type", StringType()),  # "Object" or "Enum"
            _column("definition_description", StringType()),
            _column("is_error_type", BooleanType()),
            _column("discovered_timestamp", TimestampType()),
        ]),
        # api_definitions_properties (renamed) - schema shared at module level
        "api_definitions_properties": PROPERTIES_SCHEMA,
    }

    # Create tables only if they don't exist
    for table_name, table_schema in schemas_by_table.items():
        if table_exists(table_name):
            print(f"✅ {table_name} table already exists, skipping creation")
            continue

        print(f"🏗️ Creating {table_name} table...")
        spark.createDataFrame([], table_schema) \
            .write.format("delta").mode("overwrite").saveAsTable(table_name)
        print(f"✅ {table_name} table created successfully!")

    print("✅ Enhanced Delta tables setup complete!")

    # Show the schemas as they exist in the catalog
    print("\n📋 Table schemas:")
    for table_name in schemas_by_table:
        print(f"\n{table_name}:")
        spark.sql(f"DESCRIBE {table_name}").show()

# Run the enhanced table setup when this cell executes: tables that do not
# exist yet are created, and the schema of each of the four tables is printed.
initialize_enhanced_delta_tables()

# =============================================================================
# BLOCK 5: PHASE 1 - API DISCOVERY WITH ENHANCED STORAGE - APPEND MODE
# =============================================================================

def get_next_endpoint_id() -> int:
    """Get the next available endpoint_id.

    Returns:
        ``MAX(endpoint_id) + 1`` from the api_endpoints table (1 for an empty
        table). If the lookup fails for any reason the failure is reported
        and 1 is returned.
    """
    try:
        max_result = spark.sql("SELECT COALESCE(MAX(endpoint_id), 0) as max_id FROM api_endpoints").collect()
        return int(max_result[0]['max_id']) + 1
    except Exception as e:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt and SystemExit still stop the notebook. Printing the
        # cause makes an unexpected restart of the ID sequence visible.
        print(f"   ⚠️ Could not read MAX(endpoint_id) from api_endpoints ({e}); starting at 1")
        return 1

def discover_all_api_endpoints(overview_urls: List[str]) -> Tuple[int, int]:
    """Phase 1: Discover all API endpoints and store with auto-incrementing IDs.

    Each overview page is fetched and parsed. The operations found are appended
    to the api_endpoints table, with endpoint_id values assigned sequentially
    (ordered by category, service, operation_name) starting at the next free ID.

    Args:
        overview_urls: Overview pages to scan.

    Returns:
        Tuple[int, int]: (number_of_discovered_endpoints, starting_endpoint_id)
    """
    print("\n🔍 PHASE 1: DISCOVERING API ENDPOINTS")
    print("="*50)

    # Capture the starting endpoint_id BEFORE discovery so that the rows written
    # by this run can be identified afterwards (endpoint_id >= starting id).
    starting_endpoint_id = get_next_endpoint_id()
    print(f"📊 Starting endpoint_id for this session: {starting_endpoint_id}")

    all_discovered_endpoints = []
    total_pages = len(overview_urls)

    # The context manager closes the session's pooled connections once every
    # overview page has been processed.
    with requests.Session() as session:
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        })

        for i, overview_url in enumerate(overview_urls, 1):
            print(f"\n📋 Processing overview page {i}/{total_pages}")
            print(f"   URL: {overview_url}")

            # Fetch the overview page
            response = make_safe_request(session, overview_url)
            if not response:
                print(f"   ❌ Failed to fetch overview page")
                continue

            try:
                # Extract API operations from this page
                endpoints = extract_api_operations_from_page(response.text, overview_url)
                all_discovered_endpoints.extend(endpoints)
                print(f"   ✅ Found {len(endpoints)} API operations")

                # Show what we found
                for endpoint in endpoints[:3]:
                    print(f"      - {endpoint['operation_name']}")
                if len(endpoints) > 3:
                    print(f"      - ... and {len(endpoints) - 3} more")

            except Exception as e:
                print(f"   ❌ Error processing page: {e}")

            # Be polite - wait between requests, but not after the final page
            if i < total_pages:
                time.sleep(1.5)

    print(f"\n📊 Discovery complete! Found {len(all_discovered_endpoints)} total API endpoints")

    if not all_discovered_endpoints:
        return 0, starting_endpoint_id

    # Save to enhanced table with auto-incrementing endpoint_id
    print("💾 Saving discovered endpoints to api_endpoints table...")

    # Create DataFrame and add endpoint_id starting from the captured starting ID
    endpoints_df = spark.createDataFrame(all_discovered_endpoints)
    endpoints_with_id = endpoints_df.withColumn(
        "endpoint_id",
        (row_number().over(Window.orderBy("category", "service", "operation_name")) + starting_endpoint_id - 1).cast("bigint")
    ).select(
        col("endpoint_id"),
        col("category"),
        col("overview_url"),
        col("service"),
        col("operation_name"),
        col("operation_desc"),
        col("operation_url"),
        col("discovered_timestamp")
    )

    # Write in append mode
    endpoints_with_id.write.format("delta").mode("append").option("mergeSchema", "false").saveAsTable("api_endpoints")
    print("✅ Endpoints saved to 'api_endpoints' table with endpoint_id assignments")

    # Show preview of the rows written by this run
    print("\n📊 Sample of newly discovered endpoints:")
    newest_endpoints = spark.sql(f"""
            SELECT endpoint_id, category, service, operation_name,
                   CASE 
                       WHEN LENGTH(operation_desc) > 50 
                       THEN CONCAT(SUBSTRING(operation_desc, 1, 50), '...') 
                       ELSE operation_desc 
                   END as description_preview
            FROM api_endpoints 
            WHERE endpoint_id >= {starting_endpoint_id}
            ORDER BY endpoint_id
            LIMIT 10
        """)
    newest_endpoints.show(truncate=False)

    return len(all_discovered_endpoints), starting_endpoint_id

# Run the discovery when this cell executes and capture the session info:
# total_discovered is the number of endpoints found in this run and
# session_starting_id is the first endpoint_id assigned to them.
total_discovered, session_starting_id = discover_all_api_endpoints(OVERVIEW_URLS)

# =============================================================================
# BLOCK 6: DEFINITIONS EXTRACTION FUNCTIONS - COMPLETELY FIXED
# =============================================================================

# Matches a heading whose ENTIRE text is "Definitions". The match is anchored on
# purpose: an unanchored search also hits page titles such as
# "Items - List Spark Job Definitions", which would start the walk at the <h1>
# and turn every later section heading into a bogus definition.
_DEFINITIONS_HEADING_RE = re.compile(r'^\s*Definitions\s*$', re.IGNORECASE)

# A heading whose full text equals one of these (case-insensitive) opens a
# different page section and therefore ends the Definitions walk.
_SECTION_END_MARKERS = (
    'Parameters', 'Responses', 'Examples', 'See Also',
    'HTTP request', 'Request', 'Response', 'Remarks',
    'Code samples', 'Prerequisites', 'Permissions',
    'Request body', 'Response body', 'Sample request',
    'Sample response', 'Status codes', 'Return value',
)
_SECTION_END_RE = re.compile(
    '^(?:' + '|'.join(re.escape(marker) for marker in _SECTION_END_MARKERS) + ')$',
    re.IGNORECASE,
)

# Definitions and properties whose name contains "error" are intentionally not scraped.
_ERROR_NAME_RE = re.compile(r'error', re.IGNORECASE)

_HEADING_TAGS = ('h1', 'h2', 'h3', 'h4')


def _build_definition_record(name: str, definition_type: str, description: str) -> Dict:
    """Build the dict stored for one (non-error) definition."""
    return {
        'definition_name': name,
        'definition_type': definition_type,
        'definition_description': description,
        'is_error_type': False,
        'discovered_timestamp': get_local_timestamp()
    }


def _find_definition_description(heading) -> str:
    """Return the description text that belongs to a definition heading, or ""."""
    # Inspect at most 5 following siblings; stop early at the next heading or table.
    sibling = heading.find_next_sibling()
    for _ in range(5):
        if sibling is None:
            break
        sibling_tag = getattr(sibling, 'name', None)
        if sibling_tag in ('h1', 'h2', 'h3', 'h4', 'table'):
            break
        if sibling_tag == 'p':
            desc_text = clean_description_text(sibling.get_text(strip=True))
            if desc_text and len(desc_text) > 10:  # Only use substantial descriptions
                return desc_text
        sibling = sibling.find_next_sibling()

    # Fallback: first substantial paragraph among the first 3 <p> under the heading's parent.
    # NOTE(review): if the parent is the whole article container this picks up text from
    # the top of the page rather than text about this definition - confirm it is wanted.
    parent = heading.find_parent()
    if parent is not None:
        for paragraph in parent.find_all('p', limit=3):
            desc_text = clean_description_text(paragraph.get_text(strip=True))
            if desc_text and len(desc_text) > 10:
                return desc_text
    return ""


def _parse_properties_table(table, definition_name: str, definition_type: str) -> Tuple[str, List[Dict]]:
    """Parse one table that follows a definition heading.

    Returns the (possibly updated) definition type and the property rows found.
    A table without any <th> cell is ignored.
    """
    table_properties: List[Dict] = []
    headers = table.find_all('th')
    if not headers:
        return definition_type, table_properties

    header_texts = [h.get_text(strip=True).lower() for h in headers]

    # A table with both "Value" and "Description" headers marks the definition as an Enum.
    # Otherwise the type is left as it was (so an earlier Enum table keeps it an Enum).
    if 'value' in header_texts and 'description' in header_texts:
        definition_type = "Enum"
        print(f"   🏷️ Detected as Enum type")
    else:
        print(f"   🏗️ Detected as Object type")

    is_enum = definition_type == "Enum"

    # property_order is the row position after the header row, counting skipped rows too.
    for order, row in enumerate(table.find_all('tr')[1:]):
        cells = row.find_all(['td', 'th'])
        if len(cells) < 2:
            continue

        prop_name = cells[0].get_text(strip=True)
        if is_enum:
            # Enum rows: value, description
            prop_type = ""
            prop_desc = cells[1].get_text(strip=True)
            is_required = False
        else:
            # Object rows: name, type, optional description
            prop_type = cells[1].get_text(strip=True)
            prop_desc = cells[2].get_text(strip=True) if len(cells) > 2 else ""
            # Heuristic: any occurrence of the word "required" anywhere in the row counts.
            is_required = 'required' in row.get_text().lower()

        if not prop_name:
            continue
        if _ERROR_NAME_RE.search(prop_name):
            print(f"   🚫 Skipping error property: {prop_name}")
            continue

        table_properties.append({
            'definition_name': definition_name,
            'property_name': prop_name,
            'property_type': prop_type,
            'property_description': prop_desc[:500],
            'is_required': bool(is_required),
            'is_enum_value': bool(is_enum),
            'property_order': int(order),
            'discovered_timestamp': get_local_timestamp()
        })

    print(f"   ✅ Found {len(table_properties)} properties for {definition_name}")
    return definition_type, table_properties


def extract_definitions_from_api_page(soup: BeautifulSoup) -> Tuple[List[Dict], List[Dict]]:
    """
    Extract API definitions (objects and enums) from the page's Definitions section.

    The section starts at the first h1-h4 heading whose whole text is "Definitions"
    (higher heading levels are tried first). From there the heading's following
    siblings are walked: every h1-h4 heading starts a new definition, and every
    table that follows it contributes property (or enum value) rows. The walk stops
    at a heading that exactly matches one of the known non-definition section names.

    Definitions and properties whose name contains "error" are skipped on purpose.

    Args:
        soup: Parsed HTML of a single API reference page.

    Returns:
        (definitions, properties): two lists of dicts. Each property dict carries the
        'definition_name' it belongs to so the caller can link the two lists.
    """
    definitions: List[Dict] = []
    properties: List[Dict] = []

    # Find the "Definitions" heading. Prioritize higher-level headings first.
    definitions_heading = None
    for tag_name in _HEADING_TAGS:
        definitions_heading = soup.find(tag_name, string=_DEFINITIONS_HEADING_RE)
        if definitions_heading is not None:
            break

    if definitions_heading is None:
        print("   ⚠️ No Definitions section found")
        return definitions, properties

    print(f"   📍 Found Definitions section: {definitions_heading.name}")

    current_name = None
    current_description = ""
    current_type = "Object"
    current_is_error = False

    element = definitions_heading.find_next_sibling()
    while element is not None:
        tag = getattr(element, 'name', None)

        if tag in _HEADING_TAGS:
            heading_text = element.get_text(strip=True)

            if _SECTION_END_RE.match(heading_text):
                print(f"   📍 Found section end: {heading_text}")
                break

            # Skip repeated "Definitions" sub-headers
            if _DEFINITIONS_HEADING_RE.match(heading_text):
                element = element.find_next_sibling()
                continue

            # A new definition starts here: store the one collected so far.
            # (Its type is only final now, because tables may have switched it to Enum.)
            if current_name and not current_is_error:
                print(f"   ✅ Saving definition: {current_name} ({current_type})")
                definitions.append(
                    _build_definition_record(current_name, current_type, current_description)
                )

            current_name = heading_text
            current_type = "Object"
            current_is_error = bool(_ERROR_NAME_RE.search(heading_text))

            if current_is_error:
                print(f"   🚫 Skipping error definition: {current_name}")
            else:
                print(f"   🔍 Processing definition: {current_name}")

            current_description = _find_definition_description(element)

        elif tag == 'table' and current_name and not current_is_error:
            print(f"   📊 Processing properties table for: {current_name}")
            current_type, table_properties = _parse_properties_table(
                element, current_name, current_type
            )
            properties.extend(table_properties)

        element = element.find_next_sibling()

    # Save the last definition if it exists and isn't an error
    if current_name and not current_is_error:
        print(f"   ✅ Saving final definition: {current_name} ({current_type})")
        definitions.append(
            _build_definition_record(current_name, current_type, current_description)
        )

    print(f"   🎉 TOTAL: Found {len(definitions)} definitions with {len(properties)} properties")
    return definitions, properties

def extract_description_from_api_page(soup: BeautifulSoup) -> str:
    """Extract the main description of what this API does.

    Tries a list of CSS selectors in order and returns the first matching element
    whose stripped text is longer than 20 characters, truncated to 1000 characters.
    Paragraphs belonging to the site's "unsupported browser" banner are skipped;
    without that, 'p:first-of-type' returns the banner text for every page.

    Args:
        soup: Parsed HTML of a single API reference page.

    Returns:
        The description text, or "" when no suitable element is found.
    """
    # Lower-cased fragments of the browser-upgrade banner shown at the top of each page.
    # NOTE(review): the second fragment is the banner's follow-up sentence as currently
    # published - confirm the wording if descriptions start looking wrong again.
    boilerplate_fragments = (
        'this browser is no longer supported',
        'upgrade to microsoft edge',
    )

    description_selectors = [
        'p:first-of-type',
        '.description',
        'div[class*="summary"] p'
    ]

    for selector in description_selectors:
        for elem in soup.select(selector):
            text = elem.get_text(strip=True)
            if not text or len(text) <= 20:
                continue
            lowered = text.lower()
            if any(fragment in lowered for fragment in boilerplate_fragments):
                continue
            return text[:1000]
    return ""

def extract_http_method_and_endpoint(soup: BeautifulSoup) -> Tuple[str, str]:
    """Extract the HTTP method and API endpoint URL from the page's code blocks.

    Every <code>/<pre> element is scanned in document order. The scan stops at the
    first block that contains both an HTTP verb and an API URL. URLs on the Fabric
    host (api.fabric.microsoft.com) and on the Power BI host (api.powerbi.com) are
    recognised.

    Args:
        soup: Parsed HTML of a single API reference page.

    Returns:
        (http_method, endpoint_url). endpoint_url is "" when no block holds both a
        verb and a URL; in that case http_method is the verb of the last block that
        contained one, or "" when no block did.
    """
    method_pattern = re.compile(r'\b(GET|POST|PUT|DELETE|PATCH)\s+')
    # Previously only the Fabric host was matched, so Power BI pages never got a URL.
    url_pattern = re.compile(r'https://api\.(?:fabric\.microsoft|powerbi)\.com[^\s\n]+')

    http_method = ""
    endpoint_url = ""

    for block in soup.find_all(['code', 'pre']):
        text = block.get_text(strip=True)

        method_match = method_pattern.search(text)
        if not method_match:
            continue
        http_method = method_match.group(1)

        # Only accept a URL that sits in the same block as the verb
        url_match = url_pattern.search(text)
        if url_match:
            endpoint_url = url_match.group(0)
            break

    return http_method, endpoint_url

# Cell-level status message printed once the Block 6 extraction functions above are defined.
print("✅ COMPLETELY FIXED definitions extraction functions ready!")

# =============================================================================
# BLOCK 7: PHASE 2 - ENHANCED API DOCUMENTATION SCRAPING - APPEND MODE (FIXED)
# =============================================================================

def scrape_single_api_with_definitions(url: str, endpoint_id: int, session: requests.Session) -> Tuple[Optional[Dict], List[Dict], List[Dict]]:
    """Scrape API documentation and definitions from a single API page.

    Args:
        url: Address of the API reference page to fetch.
        endpoint_id: ID of the matching api_endpoints row; copied into the doc record.
        session: HTTP session used for the request.

    Returns:
        (doc_data, definitions, properties). When the request yields no response the
        result is (None, [], []). doc_data carries no 'doc_id' yet - the caller
        assigns it.
    """
    response = make_safe_request(session, url)
    if not response:
        return None, [], []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Derive name and category from the URL path only. A query string, a fragment or
    # a trailing slash would otherwise end up in operation_name or shift the category.
    url_path = url.split('#', 1)[0].split('?', 1)[0].rstrip('/')
    url_parts = url_path.split('/')
    operation_name = url_parts[-1].replace('-', ' ').title()
    api_category = url_parts[-3] if len(url_parts) >= 3 else ""

    description = extract_description_from_api_page(soup)
    http_method, endpoint_url = extract_http_method_and_endpoint(soup)

    doc_data = {
        'endpoint_id': endpoint_id,
        'source_url': url,  # stored exactly as requested
        'api_category': api_category,
        'operation_name': operation_name,
        'description': description,
        'http_method': http_method,
        'endpoint_url': endpoint_url,
        'discovered_timestamp': get_local_timestamp()
    }

    # Extract definitions
    definitions, properties = extract_definitions_from_api_page(soup)

    return doc_data, definitions, properties

def get_next_available_ids() -> Tuple[int, int, int]:
    """Get the next available IDs for doc_id, definition_id, and property_id.

    Each ID is MAX(id column) + 1 of its table; COALESCE makes an empty table yield 1.
    The three tables are queried independently: if one lookup fails (for example
    because that table does not exist yet) only that ID falls back to 1 and the
    reason is printed. Resetting all three at once would hand out IDs that already
    exist in the tables that could be read.

    Returns:
        (next_doc_id, next_definition_id, next_property_id)
    """
    id_sources = (
        ("api_documentation", "doc_id"),
        ("api_definitions", "definition_id"),
        ("api_definitions_properties", "property_id"),
    )

    next_ids = []
    for table_name, id_column in id_sources:
        try:
            rows = spark.sql(
                f"SELECT COALESCE(MAX({id_column}), 0) as max_id FROM {table_name}"
            ).collect()
            next_ids.append(rows[0]['max_id'] + 1)
        except Exception as exc:
            # Best effort is kept (start from 1), but no longer silently.
            print(f"⚠️ Could not read MAX({id_column}) from '{table_name}', starting at 1: {exc}")
            next_ids.append(1)

    return next_ids[0], next_ids[1], next_ids[2]

def save_enhanced_batch(batch_docs: List[Dict], batch_definitions: List[Dict], batch_properties: List[Dict]) -> bool:
    """Append one batch of scraped records to the Delta tables.

    Writes happen in a fixed order - api_documentation, api_definitions, then
    api_definitions_properties - and an empty list skips its table. Property
    records are normalised in place (IDs and order to int, flags to bool) before
    they are written with the explicit PROPERTIES_SCHEMA.

    Returns:
        True when every requested write finished; False as soon as anything raises
        (the error is printed, and tables written before the failure stay written).
    """
    integer_fields = ('property_id', 'definition_id', 'property_order')
    boolean_fields = ('is_required', 'is_enum_value')

    try:
        # 1) documentation rows (carry doc_id)
        if batch_docs:
            docs_frame = spark.createDataFrame(batch_docs)
            docs_writer = docs_frame.write.format("delta").mode("append")
            docs_writer.option("mergeSchema", "false").saveAsTable("api_documentation")

        # 2) definition rows (carry definition_id plus the doc_id they belong to)
        if batch_definitions:
            definitions_frame = spark.createDataFrame(batch_definitions)
            definitions_writer = definitions_frame.write.format("delta").mode("append")
            definitions_writer.option("mergeSchema", "false").saveAsTable("api_definitions")

        # 3) property rows (carry property_id plus the definition_id they belong to)
        if batch_properties:
            # Coerce each record field by field so the values fit the explicit schema
            for record in batch_properties:
                for field in integer_fields:
                    record[field] = int(record[field])
                for field in boolean_fields:
                    record[field] = bool(record[field])

            properties_frame = spark.createDataFrame(batch_properties, schema=PROPERTIES_SCHEMA)
            properties_frame.write.format("delta").mode("append").saveAsTable("api_definitions_properties")

    except Exception as e:
        print(f"❌ Save error: {e}")
        return False

    return True

def scrape_all_discovered_apis_enhanced(session_starting_id: int, batch_size: int = 3) -> Tuple[int, int, int]:
    """Phase 2: Scrape detailed documentation and definitions for newly discovered APIs only.

    Reads every api_endpoints row with endpoint_id >= session_starting_id, scrapes its
    page, assigns doc_id / definition_id / property_id values and appends the records
    to the tables in batches.

    All records of one page are staged locally and added to the pending batch (and
    the ID counters advanced) only after the whole page was linked without error, so
    a failure in the middle of a page can neither leave half a page in the batch nor
    cause a doc_id to be handed out twice. A batch whose save fails is dropped, and
    that is reported explicitly.

    Args:
        session_starting_id: The starting endpoint_id from the current discovery session
        batch_size: Number of APIs to process before saving to database

    Returns:
        Tuple[int, int, int]: (scraped_docs, scraped_definitions, scraped_properties).
        These count what was scraped; records of a batch that failed to save are
        included in the counts.
    """
    import traceback  # local import: only used for the per-endpoint error report

    print("\n📚 PHASE 2: SCRAPING ENHANCED API DOCUMENTATION WITH DEFINITIONS")
    print("="*50)

    # Only get endpoints discovered in this session
    session_endpoints_df = spark.sql(f"""
        SELECT endpoint_id, operation_url, category, service, operation_name
        FROM api_endpoints
        WHERE endpoint_id >= {session_starting_id}
        ORDER BY endpoint_id
    """)

    session_endpoints = session_endpoints_df.collect()
    total_endpoints = len(session_endpoints)

    if total_endpoints == 0:
        print("⚠️ No new API endpoints found to scrape in this session!")
        return 0, 0, 0

    print(f"📋 Found {total_endpoints} NEW APIs to scrape (endpoint_id >= {session_starting_id})")

    # Show what we're about to scrape
    print("🎯 APIs to be scraped in this session:")
    for i, row in enumerate(session_endpoints[:5]):  # Show first 5
        print(f"   {i+1}. {row['operation_name']} (ID: {row['endpoint_id']})")
    if total_endpoints > 5:
        print(f"   ... and {total_endpoints - 5} more")

    # Set up web session
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    })

    # Get starting IDs
    next_doc_id, next_definition_id, next_property_id = get_next_available_ids()

    scraped_docs = 0
    scraped_definitions = 0
    scraped_properties = 0
    failed_batches = 0

    batch_docs = []
    batch_definitions = []
    batch_properties = []

    def _flush_batch(label: str) -> None:
        """Save the pending batch, report the outcome, and empty the batch lists."""
        nonlocal failed_batches
        summary = (f"{len(batch_docs)} docs, {len(batch_definitions)} definitions, "
                   f"{len(batch_properties)} properties")
        if save_enhanced_batch(batch_docs, batch_definitions, batch_properties):
            print(f"   💾 {label} of {summary} saved!")
        else:
            failed_batches += 1
            print(f"   ⚠️ {label} of {summary} could NOT be saved - these records were dropped")
        batch_docs.clear()
        batch_definitions.clear()
        batch_properties.clear()

    for i, row in enumerate(session_endpoints):
        endpoint_id = row['endpoint_id']
        url = row['operation_url']
        operation_name = row['operation_name']

        print(f"\n📄 Scraping {i+1}/{total_endpoints}: {operation_name}")
        print(f"   Endpoint ID: {endpoint_id}")
        print(f"   URL: {url}")

        try:
            doc_data, definitions, properties = scrape_single_api_with_definitions(url, endpoint_id, session)

            if doc_data:
                doc_data['doc_id'] = next_doc_id

                print(f"   ✅ Documentation scraped!")
                if doc_data['http_method'] and doc_data['endpoint_url']:
                    print(f"      Method: {doc_data['http_method']}")
                    print(f"      Endpoint: {doc_data['endpoint_url'][:80]}...")

                # Stage this page's records; the shared counters move only on success.
                staged_definitions = []
                staged_properties = []
                definition_id = next_definition_id
                property_id = next_property_id

                for definition in definitions:
                    definition['doc_id'] = next_doc_id
                    definition['definition_id'] = definition_id
                    staged_definitions.append(definition)

                    # Properties are linked to their definition by name
                    definition_name = definition['definition_name']
                    properties_for_this_def = [p for p in properties if p['definition_name'] == definition_name]

                    print(f"      📋 Linking {len(properties_for_this_def)} properties to {definition_name}")

                    for prop in properties_for_this_def:
                        # Work on a copy so the scraped record stays untouched
                        prop_copy = prop.copy()
                        prop_copy['definition_id'] = definition_id
                        prop_copy['property_id'] = property_id

                        # The name was only needed for linking; it is not a table column
                        prop_copy.pop('definition_name', None)

                        staged_properties.append(prop_copy)
                        property_id += 1

                    definition_id += 1

                # Commit the fully linked page to the pending batch
                batch_docs.append(doc_data)
                batch_definitions.extend(staged_definitions)
                batch_properties.extend(staged_properties)

                next_doc_id += 1
                next_definition_id = definition_id
                next_property_id = property_id

                scraped_docs += 1
                scraped_definitions += len(staged_definitions)
                scraped_properties += len(staged_properties)

                print(f"      ✅ Processed: {len(staged_definitions)} definitions, {len(staged_properties)} properties")
            else:
                print(f"   ❌ Failed to scrape documentation")

            # Save batch when it's full
            if len(batch_docs) >= batch_size:
                _flush_batch("Batch")

        except Exception as e:
            print(f"   ❌ Error: {e}")
            traceback.print_exc()

        # Be polite - wait between requests (no need to wait after the last one)
        if i < total_endpoints - 1:
            time.sleep(2.0)

    # Save any remaining data in the last batch
    if batch_docs or batch_definitions or batch_properties:
        _flush_batch("Final batch")

    print(f"\n🎉 Enhanced scraping complete!")
    print(f"   📚 Successfully scraped {scraped_docs} API documentations")
    print(f"   📋 Successfully scraped {scraped_definitions} definitions")
    print(f"   🔧 Successfully scraped {scraped_properties} properties")
    if failed_batches:
        print(f"   ⚠️ {failed_batches} batch(es) failed to save - the tables hold fewer rows than scraped")

    return scraped_docs, scraped_definitions, scraped_properties

# Run Phase 2 as soon as this cell executes. Only endpoints with
# endpoint_id >= session_starting_id (i.e. those appended by the discovery run
# above) are scraped, and the batch is written after every 3 scraped pages.
# The three results are the numbers scraped in this run, not table totals.
docs_count, definitions_count, properties_count = scrape_all_discovered_apis_enhanced(session_starting_id, batch_size=3)

# =============================================================================
# BLOCK 8: ENHANCED RESULTS AND ANALYTICS
# =============================================================================

def show_enhanced_scraping_results():
    """Print a summary report of everything stored in the four scraper tables.

    The report contains, in this order: overall row counts, the counts for the
    current session (endpoint_id >= the module-level session_starting_id), the
    overall and per-session documentation success rates, and five result tables
    (by category, by definition type, most complex definitions, a relational
    sample, and the definitions whose name starts with A-D).
    """
    def fetch_count(query: str) -> int:
        # Every counting query exposes its single value in a column named "count"
        return spark.sql(query).collect()[0]['count']

    rule = "="*70
    print("\n" + rule)
    print("🎉 ENHANCED SCRAPING COMPLETE - COMPREHENSIVE RESULTS")
    print(rule)

    # Overall totals, one COUNT(*) per table
    total_endpoints, total_docs, total_definitions, total_properties = (
        fetch_count(f"SELECT COUNT(*) as count FROM {table_name}")
        for table_name in (
            "api_endpoints",
            "api_documentation",
            "api_definitions",
            "api_definitions_properties",
        )
    )

    print(f"📋 Total discovered API endpoints: {total_endpoints}")
    print(f"📚 Total scraped API documentations: {total_docs}")
    print(f"🏗️ Total API definitions extracted: {total_definitions}")
    print(f"🔧 Total definition properties extracted: {total_properties}")

    # Totals restricted to the endpoints of the current session
    new_endpoints = fetch_count(f"SELECT COUNT(*) as count FROM api_endpoints WHERE endpoint_id >= {session_starting_id}")
    new_docs = fetch_count(f"""
        SELECT COUNT(*) as count FROM api_documentation d 
        JOIN api_endpoints e ON d.endpoint_id = e.endpoint_id 
        WHERE e.endpoint_id >= {session_starting_id}
    """)

    print(f"\n🆕 THIS SESSION RESULTS:")
    print(f"   📋 Discovered endpoints: {new_endpoints}")
    print(f"   📚 Scraped documentations: {new_docs}")

    # Success rates are only meaningful when there is at least one endpoint
    if total_endpoints > 0:
        overall_rate = (total_docs / total_endpoints) * 100
        print(f"✅ Overall documentation scraping success rate: {overall_rate:.1f}%")

    if new_endpoints > 0:
        session_rate = (new_docs / new_endpoints) * 100
        print(f"✅ This session scraping success rate: {session_rate:.1f}%")

    # (title, query, keyword arguments for DataFrame.show) for each result table
    reports = [
        ("\n📊 APIs by category:", """
        SELECT e.category, COUNT(DISTINCT e.endpoint_id) as endpoints,
               COUNT(DISTINCT d.doc_id) as documented,
               COUNT(DISTINCT def.definition_id) as definitions
        FROM api_endpoints e
        LEFT JOIN api_documentation d ON e.endpoint_id = d.endpoint_id
        LEFT JOIN api_definitions def ON d.doc_id = def.doc_id
        GROUP BY e.category
        ORDER BY endpoints DESC
    """, {}),
        ("\n🏗️ Definitions by type:", """
        SELECT definition_type, COUNT(*) as count,
               AVG(CASE WHEN definition_type = 'Object' 
                   THEN (SELECT COUNT(*) FROM api_definitions_properties p WHERE p.definition_id = def.definition_id AND p.is_enum_value = false)
                   ELSE (SELECT COUNT(*) FROM api_definitions_properties p WHERE p.definition_id = def.definition_id AND p.is_enum_value = true)
               END) as avg_properties
        FROM api_definitions def
        WHERE is_error_type = false
        GROUP BY definition_type
        ORDER BY count DESC
    """, {}),
        ("\n🔧 Most complex definitions (by property count):", """
        SELECT def.definition_name, def.definition_type, COUNT(prop.property_id) as property_count,
               doc.operation_name
        FROM api_definitions def
        JOIN api_definitions_properties prop ON def.definition_id = prop.definition_id
        JOIN api_documentation doc ON def.doc_id = doc.doc_id
        WHERE def.is_error_type = false
        GROUP BY def.definition_name, def.definition_type, doc.operation_name
        ORDER BY property_count DESC
        LIMIT 10
    """, {'truncate': False}),
        ("\n📄 Sample of complete relational data:", """
        SELECT e.category, e.service, e.operation_name,
               d.http_method, 
               def.definition_name, def.definition_type,
               COUNT(prop.property_id) as properties
        FROM api_endpoints e
        JOIN api_documentation d ON e.endpoint_id = d.endpoint_id
        JOIN api_definitions def ON d.doc_id = def.doc_id
        LEFT JOIN api_definitions_properties prop ON def.definition_id = prop.definition_id
        WHERE def.is_error_type = false
        GROUP BY e.category, e.service, e.operation_name, d.http_method, def.definition_name, def.definition_type
        ORDER BY e.category, e.service, e.operation_name
        LIMIT 15
    """, {'truncate': False}),
        ("\n🔤 Definitions starting with A-D (to verify fix):", """
        SELECT def.definition_name, def.definition_type, COUNT(prop.property_id) as property_count
        FROM api_definitions def
        LEFT JOIN api_definitions_properties prop ON def.definition_id = prop.definition_id
        WHERE def.is_error_type = false 
        AND UPPER(SUBSTRING(def.definition_name, 1, 1)) IN ('A', 'B', 'C', 'D')
        GROUP BY def.definition_name, def.definition_type
        ORDER BY def.definition_name
    """, {'truncate': False}),
    ]

    for title, query, show_options in reports:
        print(title)
        spark.sql(query).show(**show_options)

def show_powerful_analytics_queries():
    """Print six example SQL queries that the relational table design makes possible.

    Nothing is executed here: after a banner, each entry is printed as a numbered
    title followed by the query text, ready to be copied into a SQL cell.
    """
    rule = "="*70

    # (title, query text) pairs, printed in this order
    query_catalogue = [
        ("\n1️⃣ Find shared definitions across multiple APIs:", """
    SELECT def.definition_name, def.definition_type, 
           COUNT(DISTINCT e.service) as used_in_services,
           COUNT(DISTINCT d.doc_id) as used_in_apis
    FROM api_definitions def
    JOIN api_documentation d ON def.doc_id = d.doc_id
    JOIN api_endpoints e ON d.endpoint_id = e.endpoint_id
    WHERE def.is_error_type = false
    GROUP BY def.definition_name, def.definition_type
    HAVING COUNT(DISTINCT e.service) > 1
    ORDER BY used_in_services DESC, used_in_apis DESC
    """),
        ("\n2️⃣ Find APIs with the most complex object schemas:", """
    SELECT e.operation_name, e.category, COUNT(def.definition_id) as total_definitions,
           SUM(CASE WHEN def.definition_type = 'Object' THEN 1 ELSE 0 END) as objects,
           SUM(CASE WHEN def.definition_type = 'Enum' THEN 1 ELSE 0 END) as enums
    FROM api_endpoints e
    JOIN api_documentation d ON e.endpoint_id = d.endpoint_id
    JOIN api_definitions def ON d.doc_id = def.doc_id
    WHERE def.is_error_type = false
    GROUP BY e.operation_name, e.category
    ORDER BY total_definitions DESC
    """),
        ("\n3️⃣ Analyze enum complexity (values per enum):", """
    SELECT def.definition_name, COUNT(prop.property_id) as enum_values,
           e.service, d.operation_name
    FROM api_definitions def
    JOIN api_definitions_properties prop ON def.definition_id = prop.definition_id
    JOIN api_documentation d ON def.doc_id = d.doc_id
    JOIN api_endpoints e ON d.endpoint_id = e.endpoint_id
    WHERE def.definition_type = 'Enum' AND prop.is_enum_value = true
    GROUP BY def.definition_name, e.service, d.operation_name
    ORDER BY enum_values DESC
    """),
        ("\n4️⃣ Find object properties by type distribution:", """
    SELECT prop.property_type, COUNT(*) as usage_count,
           COUNT(DISTINCT def.definition_name) as used_in_objects
    FROM api_definitions_properties prop
    JOIN api_definitions def ON prop.definition_id = def.definition_id
    WHERE def.definition_type = 'Object' AND prop.is_enum_value = false
    GROUP BY prop.property_type
    ORDER BY usage_count DESC
    """),
        ("\n5️⃣ Full API coverage analysis:", """
    SELECT e.category, e.service,
           COUNT(e.endpoint_id) as total_endpoints,
           COUNT(d.doc_id) as documented_endpoints,
           COUNT(def.definition_id) as total_definitions,
           ROUND(COUNT(d.doc_id) * 100.0 / COUNT(e.endpoint_id), 1) as coverage_percent
    FROM api_endpoints e
    LEFT JOIN api_documentation d ON e.endpoint_id = d.endpoint_id
    LEFT JOIN api_definitions def ON d.doc_id = def.doc_id
    GROUP BY e.category, e.service
    ORDER BY coverage_percent DESC, total_endpoints DESC
    """),
        ("\n6️⃣ Explore specific object properties:", """
    SELECT def.definition_name, prop.property_name, prop.property_type, 
           prop.is_required, prop.property_description
    FROM api_definitions def
    JOIN api_definitions_properties prop ON def.definition_id = prop.definition_id
    WHERE def.definition_name = 'Workspace' AND prop.is_enum_value = false
    ORDER BY prop.property_order
    """),
    ]

    print("\n" + rule)
    print("🧠 POWERFUL ANALYTICS QUERIES FOR YOUR ENHANCED DATA")
    print(rule)

    for title, query_text in query_catalogue:
        print(title)
        print(query_text)

def run_sample_analytics():
    """Execute three demonstration queries over the scraped API tables.

    Prints a banner, then for every report prints its heading and displays the
    result of the matching Spark SQL statement with ``truncate=False``.
    Uses the ``spark`` name from the surrounding notebook scope and returns
    nothing.
    """
    divider = "=" * 70
    print("\n" + divider)
    print("📊 SAMPLE ANALYTICS - DEMONSTRATING RELATIONAL POWER")
    print(divider)

    # Non-error definitions whose name/type pair shows up under more than one service.
    shared_definitions_sql = """
        SELECT def.definition_name, def.definition_type, 
               COUNT(DISTINCT e.service) as used_in_services,
               COLLECT_SET(e.service) as services
        FROM api_definitions def
        JOIN api_documentation d ON def.doc_id = d.doc_id
        JOIN api_endpoints e ON d.endpoint_id = e.endpoint_id
        WHERE def.is_error_type = false
        GROUP BY def.definition_name, def.definition_type
        HAVING COUNT(DISTINCT e.service) > 1
        ORDER BY used_in_services DESC
        LIMIT 10
    """

    # Endpoints ranked by how many non-error definitions they carry,
    # with the Object / Enum split alongside the total.
    complex_apis_sql = """
        SELECT e.operation_name, e.category, e.service,
               COUNT(def.definition_id) as definition_count,
               SUM(CASE WHEN def.definition_type = 'Object' THEN 1 ELSE 0 END) as objects,
               SUM(CASE WHEN def.definition_type = 'Enum' THEN 1 ELSE 0 END) as enums
        FROM api_endpoints e
        JOIN api_documentation d ON e.endpoint_id = d.endpoint_id
        JOIN api_definitions def ON d.doc_id = def.doc_id
        WHERE def.is_error_type = false
        GROUP BY e.operation_name, e.category, e.service
        ORDER BY definition_count DESC
        LIMIT 10
    """

    # How often each property type occurs on Object definitions (enum values excluded).
    property_types_sql = """
        SELECT prop.property_type, COUNT(*) as count,
               COUNT(DISTINCT def.definition_name) as used_in_definitions
        FROM api_definitions_properties prop
        JOIN api_definitions def ON prop.definition_id = def.definition_id
        WHERE def.definition_type = 'Object' AND prop.is_enum_value = false
        GROUP BY prop.property_type
        ORDER BY count DESC
        LIMIT 15
    """

    reports = (
        ("\n🔄 Shared definitions across services:", shared_definitions_sql),
        ("\n🏗️ Most complex APIs by definition count:", complex_apis_sql),
        ("\n🔧 Property type distribution:", property_types_sql),
    )
    # Heading first, then the query result, one report at a time.
    for heading, statement in reports:
        print(heading)
        spark.sql(statement).show(truncate=False)

# Show comprehensive results
# Three reporting steps run back to back; each one is called for its printed
# output only and no return value is used.
# NOTE(review): show_enhanced_scraping_results is defined outside this section —
# presumably it summarizes the scrape; confirm against its definition.
show_enhanced_scraping_results()
# Presumably the helper that prints the example SQL statements as text — confirm.
show_powerful_analytics_queries()
# Runs three Spark SQL queries against the tables and displays the results.
run_sample_analytics()

# =============================================================================
# FINAL ENHANCED SUMMARY
# =============================================================================
# Closing banner. The lines are joined with newlines and written by one print
# call, which emits exactly the same text as printing each line on its own.
print("\n".join([
    "\n" + "=" * 70,
    "🚀 ENHANCED MICROSOFT FABRIC API SCRAPER - APPEND MODE COMPLETE!",
    "=" * 70,
    # Tables
    "✅ Four properly normalized Delta tables (append-only mode):",
    "   📋 api_endpoints - All discovered API endpoints (with endpoint_id PK)",
    "   📚 api_documentation - Essential API specs (with doc_id PK, endpoint_id FK)",
    "   🏗️ api_definitions - API object/enum definitions (with definition_id PK, doc_id FK)",
    "   🔧 api_definitions_properties - Property details (with property_id PK, definition_id FK)",
    # Feature list
    "\n🎯 Key Features:",
    "   ⚡ Integer primary/foreign keys for fast joins",
    "   🚫 Automatic Error* object exclusion",
    "   📊 Object vs Enum type classification",
    "   🔄 Proper relational integrity",
    "   📅 Timestamp tracking for all data",
    "   🔧 FIXED: A-D definitions and properties now captured correctly",
    "   🛠️ FIXED: Property linking issues resolved with proper copying",
    "   ➕ NEW: Append-only mode - tables are never dropped or truncated",
    "   🔄 NEW: Tables created only if they don't exist",
    "   🏷️ NEW: Renamed table to 'api_definitions_properties'",
    "   🎯 FIXED: Session-based processing - only scrapes newly discovered APIs",
    # Analytics use cases
    "\n🧠 Analytics Ready:",
    "   🔍 Cross-API definition analysis",
    "   📈 Schema complexity metrics",
    "   🎯 Property usage patterns",
    "   📊 API coverage tracking",
    # Sign-off
    "\n💾 All data appended to your Microsoft Fabric Lakehouse!",
    "🔧 Ready for continuous data collection and advanced analytics!",
    "=" * 70,
]))

StatementMeta(, 5fd197e0-1262-412d-b4a3-bb303a830069, 3, Finished, Available, Finished)

✅ All libraries imported successfully!
📋 Configured 5 overview URLs to process
   1. https://learn.microsoft.com/en-us/rest/api/power-bi/Scorecards(Preview)
   2. https://learn.microsoft.com/en-us/rest/api/power-bi/Goal-Notes(Preview)
   3. https://learn.microsoft.com/en-us/rest/api/power-bi/Goal-Values(Preview)
   4. https://learn.microsoft.com/en-us/rest/api/power-bi/Goals(Preview)
   5. https://learn.microsoft.com/en-us/rest/api/power-bi/Goals-Status-Rules(Preview)
✅ Helper functions defined!
🗃️ Setting up enhanced Delta tables with proper relationships...
✅ api_endpoints table already exists, skipping creation
✅ api_documentation table already exists, skipping creation
✅ api_definitions table already exists, skipping creation
✅ api_definitions_properties table already exists, skipping creation
✅ Enhanced Delta tables setup complete!

📋 Table schemas:

api_endpoints:
+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+---