In [1]:
# DO NOT DELETE THIS CELL

# API Name: 
# Command:  
# Doc:      

# Loads table: api_discovery
# Loads table: api_documentation

StatementMeta(, e2c2929e-68d0-4eb9-bb28-93801e523339, 3, Finished, Available, Finished)

In [2]:
# =============================================================================
# MICROSOFT FABRIC REST API DOCUMENTATION SCRAPER - CLEAN VERSION
# =============================================================================
# This notebook scrapes Microsoft Fabric REST API documentation and stores
# the results in Delta tables in your Microsoft Fabric Lakehouse.
#
# What this does:
# 1. Discovers API endpoints from overview pages
# 2. Scrapes detailed documentation for each endpoint
# 3. Stores everything in structured Delta tables for analysis
# =============================================================================

# =============================================================================
# BLOCK 1: IMPORTS AND SETUP
# =============================================================================
import requests
import time
import json
import re
import logging
import pytz
from datetime import datetime
from typing import List, Dict, Optional, Tuple
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Set up logging: INFO level, with timestamp and level prefixed to each message.
# NOTE(review): `logger` is created here, but the code visible in this notebook
# reports progress with print() rather than through this logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Local timezone used for every timestamp written by this notebook
# (America/Chicago: Houston, Texas - Central Time). get_local_timestamp()
# converts the current time into this zone before dropping the tzinfo.
LOCAL_TIMEZONE = pytz.timezone('America/Chicago')

def get_local_timestamp(tz=None):
    """Return the current wall-clock time in the local timezone as a naive datetime.

    The value is computed with ``datetime.now(tz)`` (timezone-aware) instead of
    the deprecated ``datetime.utcnow()``, then the tzinfo is stripped so the
    result can be stored in a timestamp column as a plain local wall-clock time.

    Args:
        tz: Optional ``tzinfo`` to use instead of the module-level
            ``LOCAL_TIMEZONE``. Defaults to ``None``, which keeps the original
            behaviour.

    Returns:
        A naive ``datetime`` holding the current time in the chosen timezone.
    """
    target_tz = LOCAL_TIMEZONE if tz is None else tz
    # datetime.now(tz) converts from UTC through tz.fromutc(), so the offset
    # (including daylight saving) that applies right now is used.
    return datetime.now(target_tz).replace(tzinfo=None)

print("✅ All libraries imported successfully!")

# =============================================================================
# BLOCK 2: CONFIGURATION - SUPPLY YOUR OVERVIEW URLS HERE
# =============================================================================
# 🎯 THIS IS WHERE YOU SUPPLY YOUR OVERVIEW URLS!

# Overview pages to crawl during discovery. Each entry is an overview page whose
# operations table links to the individual API operation pages. The extraction
# functions in this notebook record the last two path segments of each URL as
# the `category` and `service` values (e.g. "admin" / "domains").
OVERVIEW_URLS = [
    # Admin APIs
    "https://learn.microsoft.com/en-us/rest/api/fabric/admin/domains",
    "https://learn.microsoft.com/en-us/rest/api/fabric/admin/external-data-shares-provider",

]

# Echo the configuration so the cell output records exactly which pages were crawled.
print(f"📋 Configured {len(OVERVIEW_URLS)} overview URLs to process")
for i, url in enumerate(OVERVIEW_URLS, 1):
    print(f"   {i}. {url}")

# =============================================================================
# BLOCK 3: HELPER FUNCTIONS
# =============================================================================

def make_safe_request(session, url: str, max_retries: int = 3) -> "Optional[requests.Response]":
    """Fetch ``url`` with ``session``, retrying only failures that can plausibly recover.

    Retry policy:
      * HTTP 200 -> the response is returned immediately.
      * HTTP 429 -> wait ``3 * 2**attempt`` seconds, then retry.
      * HTTP 408 / 5xx and raised exceptions -> wait ``2**attempt`` seconds, then retry.
      * Any other status (e.g. 404) -> give up at once; repeating the same
        request would only return the same answer.

    No wait is performed after the final attempt, since nothing follows it.

    Args:
        session: Object exposing ``get(url, timeout=...)`` that returns a
            response with a ``status_code`` attribute (a ``requests.Session``).
        url: Address to fetch.
        max_retries: Maximum number of attempts.

    Returns:
        The successful response, or ``None`` when every attempt failed or the
        failure was not retryable.
    """
    for attempt in range(max_retries):
        has_attempts_left = attempt < max_retries - 1

        try:
            print(f"   📡 Fetching: {url} (attempt {attempt + 1})")
            response = session.get(url, timeout=30)
        except Exception as e:
            # Deliberately broad: this is a best-effort scraper, so any transport
            # error is reported and retried instead of aborting the whole run.
            print(f"   ❌ Error: {e}")
            if has_attempts_left:
                time.sleep(2 ** attempt)
            continue

        status = response.status_code
        if status == 200:
            print("   ✅ Success!")
            return response

        if status == 429:
            if has_attempts_left:
                wait_time = 2 ** attempt * 3
                print(f"   ⏳ Rate limited. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print(f"   ❌ HTTP {status}")
        elif status == 408 or status >= 500:
            # Timeouts and server-side errors are usually transient: back off and retry.
            print(f"   ❌ HTTP {status}")
            if has_attempts_left:
                time.sleep(2 ** attempt)
        else:
            # Permanent failure for this URL (e.g. 403/404): do not retry.
            print(f"   ❌ HTTP {status}")
            return None

    return None

def clean_description_text(description: str) -> str:
    """Strip preview notices and leading call-out labels from a description.

    Two kinds of boilerplate are removed:
      * the notice "This API is in preview." (optionally preceded by "Note"),
        wherever it occurs, in any letter case;
      * a leading "Note" / "Preview" label, but only when it stands as a label:
        followed by a colon, glued to or followed by a capitalised word, or
        making up the whole text.

    Words that merely contain those letters ("notebook", "denoted", "previews")
    are left untouched; unanchored patterns would silently corrupt them.

    Args:
        description: Raw description text; may be empty or ``None``.

    Returns:
        The cleaned, whitespace-normalised description, or ``""`` when the
        input is empty or fewer than 10 characters remain after cleaning.
    """
    if not description:
        return ""

    # Replace the notice with a space so sentences on either side stay separated.
    cleaned = re.sub(
        r'(?:Note\s*)?This API is in preview\.?',
        ' ',
        description,
        flags=re.IGNORECASE,
    )

    # Clean up extra whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Drop leading labels only. Case-sensitive on purpose: the look-ahead for an
    # upper-case letter is what tells the label "Note" (as in "NoteReturns ...")
    # apart from the word "Notebook".
    cleaned = re.sub(
        r'^(?:(?:Note|Preview)\s*(?::\s*|(?=[A-Z])|$))+',
        '',
        cleaned,
    ).strip()

    # If the description is now empty or too short, return empty
    if len(cleaned) < 10:
        return ""

    return cleaned

def extract_api_operations_from_page(html_content: str, overview_url: str) -> List[Dict]:
    """Extract individual API operations from an overview page.

    Every table row with at least two ``<td>`` cells is inspected. The first
    cell must contain a relative link to the operation page; the second cell
    supplies the description, taken from its first ``<p>`` after ``div.NOTE``
    call-outs have been removed. When no row yields an operation, the looser
    link scan in ``extract_operations_fallback_method`` is used instead.

    Args:
        html_content: Raw HTML of the overview page.
        overview_url: URL the page was fetched from. Relative links are resolved
            against it, and its last two path segments are recorded as
            ``category`` and ``service``.

    Returns:
        A list of dicts with the keys ``category``, ``overview_url``,
        ``service``, ``operation_name``, ``operation_desc``, ``operation_url``
        and ``discovered_timestamp``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    operations = []

    # Category and service depend only on the overview URL, so derive them once.
    # Using the parsed path (not a raw split of the whole URL) keeps them correct
    # when the URL ends with a slash or carries a query string / fragment.
    path_segments = [segment for segment in urlparse(overview_url).path.split('/') if segment]
    category = path_segments[-2] if len(path_segments) >= 2 else "unknown"
    service = path_segments[-1] if path_segments else "unknown"

    for table in soup.find_all('table'):
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < 2:
                continue

            # First cell should contain the operation link (relative links only)
            operation_link = cells[0].find('a', href=lambda x: x and not x.startswith('http'))
            if not operation_link:
                continue

            href = operation_link.get('href', '')
            operation_name = operation_link.get_text(strip=True)

            # Filter for actual API operation links
            if not href or '/' not in href or href.startswith('#'):
                continue
            if 'articles' in href or 'quickstart' in href:
                continue
            if not operation_name:
                continue

            # Second cell contains the description; NOTE call-outs are dropped
            # first so that their text cannot end up in the description.
            desc_cell = cells[1]
            for note_div in desc_cell.find_all('div', class_='NOTE'):
                note_div.decompose()

            description = ""
            p_tag = desc_cell.find('p')
            if p_tag:
                description = clean_description_text(p_tag.get_text(strip=True))

            operations.append({
                'category': category,
                'overview_url': overview_url,
                'service': service,
                'operation_name': operation_name,
                'operation_desc': description,
                'operation_url': urljoin(overview_url, href),
                'discovered_timestamp': get_local_timestamp()
            })

    # Fallback: if the tables produced nothing, try the alternative method
    if not operations:
        print("   ⚠️ No table found, trying alternative extraction method...")
        operations = extract_operations_fallback_method(soup, overview_url)

    return operations

def extract_operations_fallback_method(soup: BeautifulSoup, overview_url: str) -> List[Dict]:
    """Fallback method for pages that don't use the standard table format.

    Scans every relative link on the page and keeps those that look like
    operation links: the href contains a ``/``, is not an in-page anchor, and
    does not mention ``articles`` or ``quickstart``. When the link sits inside
    a table row, the first sufficiently long ``<p>`` found in that row's cells
    is used as the description.

    Args:
        soup: Parsed overview page. ``div.NOTE`` elements inside inspected rows
            are removed from it as a side effect.
        overview_url: URL the page was fetched from. Relative links are resolved
            against it, and its last two path segments are recorded as
            ``category`` and ``service``.

    Returns:
        A list of operation dicts with the keys ``category``, ``overview_url``,
        ``service``, ``operation_name``, ``operation_desc``, ``operation_url``
        and ``discovered_timestamp``.
    """
    operations = []

    # Category and service depend only on the overview URL, so derive them once.
    # Using the parsed path (not a raw split of the whole URL) keeps them correct
    # when the URL ends with a slash or carries a query string / fragment.
    path_segments = [segment for segment in urlparse(overview_url).path.split('/') if segment]
    category = path_segments[-2] if len(path_segments) >= 2 else "unknown"
    service = path_segments[-1] if path_segments else "unknown"

    # Find all links that point to API operations
    operation_links = soup.find_all('a', href=lambda x: x and not x.startswith('http'))

    for link in operation_links:
        href = link.get('href', '')

        # Filter for actual API operation links
        if not href or '/' not in href or href.startswith('#'):
            continue
        if 'articles' in href or 'quickstart' in href:
            continue

        operation_name = link.get_text(strip=True)
        if not operation_name:
            continue

        # Try to find a description in the table row that contains the link
        description = ""
        parent_row = link.find_parent('tr')
        if parent_row:
            for cell in parent_row.find_all('td'):
                # Remove NOTE divs so their text is not mistaken for the description
                for note_div in cell.find_all('div', class_='NOTE'):
                    note_div.decompose()

                p_tag = cell.find('p')
                if p_tag:
                    desc_text = p_tag.get_text(strip=True)
                    if desc_text and len(desc_text) > 10:
                        description = clean_description_text(desc_text)
                        break

        operations.append({
            'category': category,
            'overview_url': overview_url,
            'service': service,
            'operation_name': operation_name,
            'operation_desc': description,
            'operation_url': urljoin(overview_url, href),
            'discovered_timestamp': get_local_timestamp()
        })

    return operations

print("✅ Helper functions defined!")

# =============================================================================
# BLOCK 4: DELTA TABLE SETUP
# =============================================================================

def initialize_delta_tables():
    """Create the Delta tables if they don't exist.

    ``api_documentation`` is (re)created when it is missing or when it lacks
    the ``discovered_timestamp`` column; ``api_discovery`` is created only when
    it does not exist yet.

    Existence and schema are checked explicitly through the catalog. A bare
    ``except:`` around a probe query would treat *any* failure (for example a
    transient metastore or storage error) as "table missing" and then drop a
    table that still holds data. With the explicit checks such errors propagate
    to the caller and nothing is dropped.

    Relies on the notebook-provided global ``spark`` session.
    """
    print("🗃️ Setting up Delta tables...")

    # Schema for the API discovery table
    discovery_schema = StructType([
        StructField("category", StringType(), True),
        StructField("overview_url", StringType(), True),
        StructField("service", StringType(), True),
        StructField("operation_name", StringType(), True),
        StructField("operation_desc", StringType(), True),
        StructField("operation_url", StringType(), True),
        StructField("discovered_timestamp", TimestampType(), True)
    ])

    # Schema for parameter information
    param_schema = StructType([
        StructField("name", StringType(), True),
        StructField("type", StringType(), True),
        StructField("required", BooleanType(), True),
        StructField("description", StringType(), True)
    ])

    # Schema for the API documentation table (with discovered_timestamp)
    documentation_schema = StructType([
        StructField("source_url", StringType(), True),
        StructField("api_category", StringType(), True),
        StructField("operation_name", StringType(), True),
        StructField("description", StringType(), True),
        StructField("http_method", StringType(), True),
        StructField("endpoint_url", StringType(), True),
        StructField("parameters", ArrayType(param_schema), True),
        StructField("discovered_timestamp", TimestampType(), True)
    ])

    # Decide whether api_documentation has to be (re)created.
    if spark.catalog.tableExists("api_documentation"):
        existing_count = spark.sql("SELECT COUNT(*) as count FROM api_documentation").collect()[0]['count']
        print(f"📊 api_documentation table exists with {existing_count} records")

        # Compare column names case-insensitively, matching Spark's default resolution.
        existing_columns = {name.lower() for name in spark.table("api_documentation").columns}
        if "discovered_timestamp" in existing_columns:
            print("✅ Table has correct schema with discovered_timestamp column")
            table_needs_recreation = False
        else:
            print("⚠️ Table exists but missing discovered_timestamp column - will recreate")
            table_needs_recreation = True
    else:
        print("📊 api_documentation table doesn't exist - will create new one")
        table_needs_recreation = True

    # Recreate api_documentation table if needed (this discards any existing rows)
    if table_needs_recreation:
        print("🗑️ Dropping existing api_documentation table (if exists)...")
        spark.sql("DROP TABLE IF EXISTS api_documentation")

        print("🏗️ Creating new api_documentation table with discovered_timestamp column...")
        empty_documentation = spark.createDataFrame([], documentation_schema)
        empty_documentation.write.format("delta").mode("overwrite").saveAsTable("api_documentation")
        print("✅ New api_documentation table created!")

    # Create discovery table (mode "ignore" leaves an existing table untouched)
    empty_discovery = spark.createDataFrame([], discovery_schema)
    empty_discovery.write.format("delta").mode("ignore").saveAsTable("api_discovery")

    print("✅ Delta tables ready!")

    # Show the current schema
    print("\n📋 api_documentation table schema:")
    spark.sql("DESCRIBE api_documentation").show()

# Run the table setup
initialize_delta_tables()

# =============================================================================
# BLOCK 5: PHASE 1 - API DISCOVERY
# =============================================================================

def discover_all_api_endpoints(overview_urls: List[str]) -> List[Dict]:
    """Phase 1: Discover all API endpoints from overview pages.

    Each overview page is fetched with a shared HTTP session and parsed with
    ``extract_api_operations_from_page``. Pages that cannot be fetched or
    parsed are reported and skipped, so one bad page does not stop the run.

    The session is used as a context manager so its pooled connections are
    released when discovery ends. The politeness delay is applied *between*
    requests: also after a failed fetch, but not after the final page.

    Args:
        overview_urls: Overview page URLs to process, in order.

    Returns:
        The operation dicts discovered across all pages.
    """
    print("\n🔍 PHASE 1: DISCOVERING API ENDPOINTS")
    print("="*50)

    all_discovered_endpoints = []

    with requests.Session() as session:
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        })

        for i, overview_url in enumerate(overview_urls, 1):
            # Be polite - wait between requests
            if i > 1:
                time.sleep(1.5)

            print(f"\n📋 Processing overview page {i}/{len(overview_urls)}")
            print(f"   URL: {overview_url}")

            # Fetch the overview page
            response = make_safe_request(session, overview_url)
            if not response:
                print(f"   ❌ Failed to fetch overview page")
                continue

            try:
                # Extract API operations from this page
                endpoints = extract_api_operations_from_page(response.text, overview_url)
                all_discovered_endpoints.extend(endpoints)
                print(f"   ✅ Found {len(endpoints)} API operations")

                # Show a short preview of what was found
                for endpoint in endpoints[:3]:
                    print(f"      - {endpoint['operation_name']}")
                if len(endpoints) > 3:
                    print(f"      - ... and {len(endpoints) - 3} more")

            except Exception as e:
                # Best effort: report the parsing problem and move on to the next page.
                print(f"   ❌ Error processing page: {e}")

    print(f"\n🎉 Discovery complete! Found {len(all_discovered_endpoints)} total API endpoints")
    return all_discovered_endpoints

# Run the discovery
discovered_endpoints = discover_all_api_endpoints(OVERVIEW_URLS)

# =============================================================================
# BLOCK 6: SAVE DISCOVERED ENDPOINTS
# =============================================================================

def save_discovered_endpoints_to_table(endpoints: List[Dict]):
    """Save discovered endpoints to the api_discovery Delta table.

    Descriptions are passed through ``clean_description_text`` once more, then
    the rows are appended to ``api_discovery``. The DataFrame is built with an
    explicit schema (the same columns ``initialize_delta_tables`` creates), so
    column types do not depend on schema inference from the dict values.

    Args:
        endpoints: Operation dicts as produced by the discovery phase. When the
            list is empty nothing is written.

    Relies on the notebook-provided global ``spark`` session.
    """
    if not endpoints:
        print("❌ No endpoints to save")
        return

    print(f"\n💾 Saving {len(endpoints)} discovered endpoints to Delta table...")

    # Clean the data before saving (copies are made so the caller's dicts stay untouched)
    cleaned_endpoints = []
    for endpoint in endpoints:
        cleaned_endpoint = endpoint.copy()
        if 'operation_desc' in cleaned_endpoint:
            cleaned_endpoint['operation_desc'] = clean_description_text(cleaned_endpoint['operation_desc'])
        cleaned_endpoints.append(cleaned_endpoint)

    # Explicit schema matching the api_discovery table definition
    discovery_schema = StructType([
        StructField("category", StringType(), True),
        StructField("overview_url", StringType(), True),
        StructField("service", StringType(), True),
        StructField("operation_name", StringType(), True),
        StructField("operation_desc", StringType(), True),
        StructField("operation_url", StringType(), True),
        StructField("discovered_timestamp", TimestampType(), True)
    ])

    # Convert to DataFrame and save
    endpoints_df = spark.createDataFrame(cleaned_endpoints, schema=discovery_schema)
    endpoints_df.write.format("delta").mode("append").saveAsTable("api_discovery")

    print("✅ Endpoints saved to 'api_discovery' table")

    # Show a preview
    print("\n📊 Sample of discovered endpoints (with cleaned descriptions):")
    spark.sql("""
        SELECT category, service, operation_name,
               CASE
                   WHEN LENGTH(operation_desc) > 50
                   THEN CONCAT(SUBSTRING(operation_desc, 1, 50), '...')
                   ELSE operation_desc
               END as description_preview
        FROM api_discovery
        WHERE operation_desc IS NOT NULL AND operation_desc != ''
        ORDER BY category, service, operation_name
        LIMIT 10
    """).show(truncate=False)

# Save the discovered endpoints
save_discovered_endpoints_to_table(discovered_endpoints)

# =============================================================================
# BLOCK 7: DETAILED CONTENT EXTRACTION FUNCTIONS
# =============================================================================

def extract_description_from_api_page(soup: BeautifulSoup) -> str:
    """Return the first descriptive text found on an API page.

    The CSS selectors are tried in order; the first matched element whose
    stripped text is longer than 20 characters wins, truncated to 1000
    characters. An empty string is returned when nothing qualifies.
    """
    # Lazy pipeline: selectors are only evaluated until a match is found.
    candidate_texts = (
        node.get_text(strip=True)
        for css in ('p:first-of-type', '.description', 'div[class*="summary"] p')
        for node in soup.select(css)
    )
    first_long_text = next(
        (candidate for candidate in candidate_texts if candidate and len(candidate) > 20),
        "",
    )
    return first_long_text[:1000]

def extract_http_method_and_endpoint(soup: BeautifulSoup) -> Tuple[str, str]:
    """Find the HTTP verb and the Fabric API URL shown in the page's code samples.

    ``<code>`` and ``<pre>`` elements are scanned in document order. Whenever an
    element contains an HTTP verb, that verb is remembered; the scan stops at
    the first such element that also contains an ``api.fabric.microsoft.com``
    URL. Either value is ``""`` when it was never found.
    """
    verb_pattern = re.compile(r'\b(GET|POST|PUT|DELETE|PATCH)\s+')
    url_pattern = re.compile(r'https://api\.fabric\.microsoft\.com[^\s\n]+')

    verb, target = "", ""
    for node in soup.find_all(['code', 'pre']):
        snippet = node.get_text(strip=True)

        verb_hit = verb_pattern.search(snippet)
        if verb_hit is None:
            continue
        verb = verb_hit.group(1)

        # Only a URL in the same snippet as the verb ends the search.
        url_hit = url_pattern.search(snippet)
        if url_hit is not None:
            target = url_hit.group(0)
            break

    return verb, target

def extract_parameters_from_tables(soup: BeautifulSoup) -> List[Dict]:
    """Extract parameter information from documentation tables.

    A table counts as a parameters table when its header row has a ``Name``
    column plus a ``Type`` or ``Description`` column. Values are read by
    *header name*, not by fixed position, so a layout such as
    ``Name | In | Required | Type | Description`` no longer stores the "In"
    column as the type.

    ``required`` is taken from the ``Required`` column when the table has one.
    Otherwise a row is required only if one of its non-name cells consists
    solely of "true", "yes" or "required"; a description that merely mentions
    those words does not count.

    Args:
        soup: Parsed API documentation page.

    Returns:
        A list of dicts with the keys ``name``, ``type``, ``required`` (bool)
        and ``description`` (at most 500 characters).
    """
    truthy_markers = {'true', 'yes', 'required'}
    parameters = []

    for table in soup.find_all('table'):
        # Tables without any header cell are not parameter tables.
        if not table.find_all('th'):
            continue

        rows = table.find_all('tr')
        if not rows:
            continue

        header_texts = [cell.get_text(strip=True).lower() for cell in rows[0].find_all(['th', 'td'])]

        # Check if this looks like a parameters table
        if 'name' not in header_texts:
            continue
        if 'type' not in header_texts and 'description' not in header_texts:
            continue

        # Header text -> column position (first occurrence wins).
        column_of = {}
        for position, header in enumerate(header_texts):
            column_of.setdefault(header, position)

        for row in rows[1:]:  # Skip header row
            cells = row.find_all(['td', 'th'])
            if len(cells) < 2:
                continue

            param_name = _parameter_cell_text(cells, column_of, 'name') or ""
            if not param_name:
                continue

            param_type = _parameter_cell_text(cells, column_of, 'type') or ""

            required_text = _parameter_cell_text(cells, column_of, 'required')
            if required_text is not None:
                is_required = required_text.lower() in truthy_markers
            else:
                # No dedicated column: accept only a cell that is exactly a marker.
                is_required = any(
                    cell.get_text(strip=True).lower() in truthy_markers
                    for position, cell in enumerate(cells)
                    if position != column_of['name']
                )

            param_desc = _parameter_cell_text(cells, column_of, 'description')
            if param_desc is None:
                # No description column: fall back to the last cell of wider rows.
                param_desc = cells[-1].get_text(strip=True) if len(cells) > 2 else ""

            parameters.append({
                'name': param_name,
                'type': param_type,
                'required': is_required,
                'description': param_desc[:500]
            })

    return parameters


def _parameter_cell_text(cells, column_of, header):
    """Return the stripped text of the cell under ``header``, or None if the row has no such cell."""
    position = column_of.get(header)
    if position is None or position >= len(cells):
        return None
    return cells[position].get_text(strip=True)

print("✅ Content extraction functions ready!")

# =============================================================================
# BLOCK 8: PHASE 2 - DETAILED API DOCUMENTATION SCRAPING
# =============================================================================

def scrape_single_api_documentation(url: str, session: requests.Session) -> Optional[Dict]:
    """Scrape essential documentation from a single API page.

    The operation name and category are derived from the *path* of ``url``:
    the last segment becomes the title-cased operation name and the third
    segment from the end becomes the category. Parsing the URL first keeps a
    query string, fragment or trailing slash out of those values.

    Args:
        url: Address of the operation's documentation page.
        session: HTTP session used to fetch the page.

    Returns:
        A dict with the keys ``source_url``, ``operation_name``,
        ``api_category``, ``description``, ``http_method``, ``endpoint_url``
        and ``parameters``, or ``None`` when the page could not be fetched.
    """
    response = make_safe_request(session, url)
    if not response:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract essential information only
    doc_data = {
        'source_url': url
    }

    # Get the operation name and category from the URL path
    path_segments = [segment for segment in urlparse(url).path.split('/') if segment]
    doc_data['operation_name'] = path_segments[-1].replace('-', ' ').title() if path_segments else ""
    doc_data['api_category'] = path_segments[-3] if len(path_segments) >= 3 else ""

    # Extract essential information
    doc_data['description'] = extract_description_from_api_page(soup)

    http_method, endpoint_url = extract_http_method_and_endpoint(soup)
    doc_data['http_method'] = http_method
    doc_data['endpoint_url'] = endpoint_url

    # Extract parameters
    doc_data['parameters'] = extract_parameters_from_tables(soup)

    return doc_data

def clean_document_data(doc_data: Dict) -> Dict:
    """Clean document data to ensure schema compliance.

    Every scalar field is coerced to ``str`` and every parameter entry to the
    ``name`` / ``type`` / ``required`` / ``description`` shape. Missing *and*
    ``None`` values become ``""``; a plain ``str(None)`` would store the
    literal text "None" in the table. ``discovered_timestamp`` is set to the
    current local time.

    Args:
        doc_data: Raw scraped fields; keys may be missing.

    Returns:
        A new dict with the keys ``source_url``, ``api_category``,
        ``operation_name``, ``description``, ``http_method``, ``endpoint_url``,
        ``discovered_timestamp`` and ``parameters``.
    """
    def as_text(value) -> str:
        # None means "no value": keep it empty instead of the string "None".
        return "" if value is None else str(value)

    cleaned = {
        'source_url': as_text(doc_data.get('source_url')),
        'api_category': as_text(doc_data.get('api_category')),
        'operation_name': as_text(doc_data.get('operation_name')),
        'description': as_text(doc_data.get('description')),
        'http_method': as_text(doc_data.get('http_method')),
        'endpoint_url': as_text(doc_data.get('endpoint_url')),
        'discovered_timestamp': get_local_timestamp()  # Add current local timestamp
    }

    # Handle parameters carefully: anything that is not a list of dicts is dropped
    params = doc_data.get('parameters', [])
    if not isinstance(params, list):
        params = []

    cleaned['parameters'] = [
        {
            'name': as_text(param.get('name')),
            'type': as_text(param.get('type')),
            'required': bool(param.get('required', False)),
            'description': as_text(param.get('description'))
        }
        for param in params
        if isinstance(param, dict)
    ]
    return cleaned

def save_clean_batch(batch_data: List[Dict]) -> bool:
    """Append a batch of cleaned documents to the ``api_documentation`` table.

    The DataFrame is created with an explicit schema that includes
    ``discovered_timestamp``. Any exception is reported on stdout and turned
    into a ``False`` return value; ``True`` means the append completed.
    """
    try:
        def nullable(column_name, data_type):
            # Every column in this table is declared nullable.
            return StructField(column_name, data_type, True)

        parameter_struct = StructType([
            nullable("name", StringType()),
            nullable("type", StringType()),
            nullable("required", BooleanType()),
            nullable("description", StringType()),
        ])

        table_schema = StructType([
            nullable("source_url", StringType()),
            nullable("api_category", StringType()),
            nullable("operation_name", StringType()),
            nullable("description", StringType()),
            nullable("http_method", StringType()),
            nullable("endpoint_url", StringType()),
            nullable("parameters", ArrayType(parameter_struct)),
            nullable("discovered_timestamp", TimestampType()),
        ])

        # Build the frame with the explicit schema and append it in one chain.
        batch_frame = spark.createDataFrame(batch_data, schema=table_schema)
        batch_frame.write.format("delta").mode("append").saveAsTable("api_documentation")

    except Exception as e:
        print(f"❌ Save error: {e}")
        return False

    return True

def scrape_all_discovered_apis(batch_size: int = 5) -> int:
    """Phase 2: Scrape detailed documentation for all discovered APIs.

    Only URLs present in ``api_discovery`` but absent from
    ``api_documentation`` are processed, so the function can be re-run safely.
    Scraped documents are written in batches of ``batch_size``.

    A batch is cleared only after ``save_clean_batch`` reports success. When a
    save fails, the records stay in the batch and are retried with the next
    flush instead of being discarded. Records still unsaved at the end are
    reported; their URLs remain unprocessed and are picked up by the next run.

    The HTTP session is closed when scraping ends, and the politeness delay is
    applied between requests only (not after the last one).

    Args:
        batch_size: Number of scraped documents to accumulate before saving.

    Returns:
        The number of pages scraped successfully (0 when nothing was pending).

    Relies on the notebook-provided global ``spark`` session.
    """
    print("\n📚 PHASE 2: SCRAPING DETAILED API DOCUMENTATION")
    print("="*50)

    # Get URLs that we haven't scraped yet
    unprocessed_df = spark.sql("""
        SELECT DISTINCT d.operation_url, d.category, d.service, d.operation_name
        FROM api_discovery d
        LEFT JOIN api_documentation doc ON d.operation_url = doc.source_url
        WHERE doc.source_url IS NULL
        ORDER BY d.category, d.service, d.operation_name
    """)

    unprocessed_urls = unprocessed_df.collect()
    total_urls = len(unprocessed_urls)

    if total_urls == 0:
        print("✅ All discovered APIs have already been scraped!")
        return 0

    print(f"📋 Found {total_urls} APIs to scrape")

    scraped_count = 0
    saved_count = 0
    batch_data = []

    with requests.Session() as session:
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        for i, row in enumerate(unprocessed_urls):
            # Be polite - wait between requests
            if i > 0:
                time.sleep(2.0)

            url = row['operation_url']
            operation_name = row['operation_name']

            print(f"\n📄 Scraping {i+1}/{total_urls}: {operation_name}")
            print(f"   URL: {url}")

            try:
                doc_data = scrape_single_api_documentation(url, session)
                if doc_data:
                    cleaned_data = clean_document_data(doc_data)
                    batch_data.append(cleaned_data)
                    scraped_count += 1
                    print(f"   ✅ Successfully scraped!")

                    # Show what we extracted
                    if cleaned_data['http_method'] and cleaned_data['endpoint_url']:
                        print(f"      Method: {cleaned_data['http_method']}")
                        print(f"      Endpoint: {cleaned_data['endpoint_url'][:80]}...")
                else:
                    print(f"   ❌ Failed to scrape")
            except Exception as e:
                # Best effort: one bad page must not abort the remaining URLs.
                print(f"   ❌ Error: {e}")
                continue

            # Save batch when it's full
            if batch_data and len(batch_data) >= batch_size:
                if save_clean_batch(batch_data):
                    print(f"   💾 Batch of {len(batch_data)} saved!")
                    saved_count += len(batch_data)
                    batch_data = []
                else:
                    # Keep the records: the next flush retries them.
                    print(f"   ⚠️ Batch of {len(batch_data)} not saved - will retry with the next batch")

    # Save any remaining data
    if batch_data:
        if save_clean_batch(batch_data):
            print(f"   💾 Final batch of {len(batch_data)} saved!")
            saved_count += len(batch_data)
        else:
            print(f"   ⚠️ {len(batch_data)} scraped records could not be saved; "
                  f"their URLs remain unprocessed and will be retried on the next run")

    print(f"\n🎉 Scraping complete! Successfully scraped {scraped_count}/{total_urls} APIs")
    if saved_count != scraped_count:
        print(f"   ⚠️ Only {saved_count} of {scraped_count} scraped records were saved to 'api_documentation'")
    return scraped_count

# Run the detailed scraping
scraped_count = scrape_all_discovered_apis(batch_size=5)

# =============================================================================
# BLOCK 9: RESULTS AND SUMMARY
# =============================================================================

def show_scraping_results():
    """Display a summary of what we scraped.

    Prints the number of discovered endpoints, the number of scraped documents,
    the resulting success rate, a per-category breakdown and a sample of rows.

    Both totals count *distinct URLs*. ``api_discovery`` is appended to on
    every run, so a raw row count grows with each re-run, while
    ``api_documentation`` only receives URLs it does not hold yet; comparing raw
    row counts would therefore understate the success rate after a re-run.

    Relies on the notebook-provided global ``spark`` session.
    """
    print("\n" + "="*60)
    print("🎉 SCRAPING COMPLETE - RESULTS SUMMARY")
    print("="*60)

    # Count discovered endpoints (distinct operation pages)
    discovery_count = spark.sql(
        "SELECT COUNT(DISTINCT operation_url) as count FROM api_discovery"
    ).collect()[0]['count']
    print(f"📋 Total discovered API endpoints: {discovery_count}")

    # Count scraped documentation (distinct source pages)
    doc_count = spark.sql(
        "SELECT COUNT(DISTINCT source_url) as count FROM api_documentation"
    ).collect()[0]['count']
    print(f"📚 Total scraped API documents: {doc_count}")

    # Calculate success rate (skipped when nothing was discovered, avoiding a division by zero)
    if discovery_count > 0:
        success_rate = (doc_count / discovery_count) * 100
        print(f"✅ Success rate: {success_rate:.1f}%")

    # Show breakdown by category
    print("\n📊 APIs by category:")
    spark.sql("""
        SELECT api_category, COUNT(*) as count
        FROM api_documentation
        GROUP BY api_category
        ORDER BY count DESC
    """).show()

    # Show some examples of what we scraped (including discovered_timestamp)
    print("\n📄 Sample of scraped APIs:")
    spark.sql("""
        SELECT operation_name, http_method,
               CASE
                   WHEN LENGTH(endpoint_url) > 50
                   THEN CONCAT(SUBSTRING(endpoint_url, 1, 50), '...')
                   ELSE endpoint_url
               END as endpoint_preview,
               discovered_timestamp
        FROM api_documentation
        WHERE http_method IS NOT NULL AND http_method != ''
        ORDER BY api_category, operation_name
        LIMIT 10
    """).show(truncate=False)

def show_useful_queries():
    """Print a short catalogue of example SQL queries for the scraped tables.

    Output only: a banner followed by six numbered headings, each with the
    query text beneath it. Nothing is executed against Spark.
    """
    print("\n" + "="*60)
    print("📝 USEFUL QUERIES FOR YOUR SIMPLIFIED DATA")
    print("="*60)

    # (heading, query) pairs, printed in this order.
    examples = (
        (
            "\n1️⃣ See all GET endpoints:",
            "   SELECT operation_name, endpoint_url FROM api_documentation WHERE http_method = 'GET'",
        ),
        (
            "\n2️⃣ Find APIs with parameters:",
            "   SELECT operation_name, SIZE(parameters) as param_count FROM api_documentation WHERE SIZE(parameters) > 0",
        ),
        (
            "\n3️⃣ See all discovered but not yet scraped APIs:",
            """   SELECT d.operation_name, d.operation_url 
   FROM api_discovery d 
   LEFT JOIN api_documentation doc ON d.operation_url = doc.source_url 
   WHERE doc.source_url IS NULL""",
        ),
        (
            "\n4️⃣ Count APIs by HTTP method:",
            "   SELECT http_method, COUNT(*) FROM api_documentation GROUP BY http_method",
        ),
        (
            "\n5️⃣ Find APIs with descriptions:",
            "   SELECT operation_name FROM api_documentation WHERE description != '' AND description IS NOT NULL",
        ),
        (
            "\n6️⃣ View parameter details for specific API:",
            "   SELECT operation_name, EXPLODE(parameters) as param FROM api_documentation WHERE operation_name LIKE '%Create%'",
        ),
    )

    for heading, query in examples:
        print(heading)
        print(query)

# Show the results
show_scraping_results()
show_useful_queries()

# =============================================================================
# FINAL MESSAGE
# =============================================================================
# Closing banner: recap the api_documentation columns and where the results were
# stored. Purely informational - no table is read or written here. The lines are
# joined and emitted with a single print call.
print("\n".join([
    "\n" + "="*60,
    "🔧 UPDATED SCHEMA:",
    "✅ source_url",
    "✅ api_category",
    "✅ operation_name",
    "✅ description",
    "✅ http_method",
    "✅ endpoint_url",
    "✅ parameters (array)",
    "✅ discovered_timestamp (NEW - local time when API was scraped)",
    "\n🚀 ALL DONE!",
    "="*60,
    "Your Microsoft Fabric REST API documentation has been scraped and stored in:",
    "📋 Table 'api_discovery' - All discovered API endpoints",
    "📚 Table 'api_documentation' - Essential API specifications with timestamps",
    "\nYou can now query these tables to analyze the APIs!",
    "📅 The discovered_timestamp column shows when each API was scraped (local time)",
    "="*60,
]))

StatementMeta(, e2c2929e-68d0-4eb9-bb28-93801e523339, 4, Finished, Available, Finished)

✅ All libraries imported successfully!
📋 Configured 2 overview URLs to process
   1. https://learn.microsoft.com/en-us/rest/api/fabric/admin/domains
   2. https://learn.microsoft.com/en-us/rest/api/fabric/admin/external-data-shares-provider
✅ Helper functions defined!
🗃️ Setting up Delta tables...
📊 api_documentation table doesn't exist - will create new one
🗑️ Dropping existing api_documentation table (if exists)...
🏗️ Creating new api_documentation table with discovered_timestamp column...
✅ New api_documentation table created!
✅ Delta tables ready!

📋 api_documentation table schema:
+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|          source_url|              string|   NULL|
|        api_category|              string|   NULL|
|      operation_name|              string|   NULL|
|         description|              string|   NULL|
|         http_method|              string|