In [2]:
import re
from urllib.parse import urljoin
from playwright.async_api import async_playwright

# Listing page to scrape; the query string carries tagIds=14101-Tabular
# (presumably Kaggle's "Tabular" tag filter - confirm against the site).
URL = "https://www.kaggle.com/competitions?tagIds=14101-Tabular"
# Matches an href that is exactly "/competitions/<slug>" and captures the slug.
# Hrefs with further path segments, a query string, a fragment, or a
# scheme/host prefix do not match.
COMP_RE = re.compile(r"^/competitions/([^/?#]+)$")

async def fetch_competition_links(url: str, scroll_rounds: int = 25, pause_ms: int = 800):
    """Collect competition page URLs linked from a listing page.

    The page is opened in headless Chromium and scrolled to the bottom
    ``scroll_rounds`` times, pausing ``pause_ms`` milliseconds after each
    scroll, before every ``<a href>`` value is read.

    Args:
        url: Listing page to open.
        scroll_rounds: Number of scroll-to-bottom iterations.
        pause_ms: Pause after each scroll, in milliseconds.

    Returns:
        A list of ``https://www.kaggle.com/competitions/<slug>`` URLs, one per
        distinct slug matched by ``COMP_RE``, sorted by slug.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, wait_until="networkidle")

            for _ in range(scroll_rounds):
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.wait_for_timeout(pause_ms)

            hrefs = await page.eval_on_selector_all(
                "a[href]",
                "els => els.map(e => e.getAttribute('href'))"
            )
        finally:
            # Close the browser even when navigation or evaluation fails.
            await browser.close()

    slugs = set()
    for href in hrefs:
        if not href:
            continue
        m = COMP_RE.match(href.strip())
        if m:
            slugs.add(m.group(1))

    return [f"https://www.kaggle.com/competitions/{s}" for s in sorted(slugs)]

# Relies on IPython/Jupyter top-level await; this line is not valid in a plain script.
links = await fetch_competition_links(URL, scroll_rounds=10)
# Number of distinct competition URLs found.
print(len(links))
# Preview at most the first 20 URLs, one per line.
print("\n".join(links[:20]))

20
https://www.kaggle.com/competitions/bigquery-ai-hackathon
https://www.kaggle.com/competitions/cafa-6-protein-function-prediction
https://www.kaggle.com/competitions/google-gemma-3n-hackathon
https://www.kaggle.com/competitions/hull-tactical-market-prediction
https://www.kaggle.com/competitions/jane-street-real-time-market-data-forecasting
https://www.kaggle.com/competitions/meta-kaggle-hackathon
https://www.kaggle.com/competitions/playground-series-s4e11
https://www.kaggle.com/competitions/playground-series-s4e12
https://www.kaggle.com/competitions/playground-series-s5e1
https://www.kaggle.com/competitions/playground-series-s5e10
https://www.kaggle.com/competitions/playground-series-s5e11
https://www.kaggle.com/competitions/playground-series-s5e12
https://www.kaggle.com/competitions/playground-series-s5e2
https://www.kaggle.com/competitions/playground-series-s5e3
https://www.kaggle.com/competitions/playground-series-s5e4
https://www.kaggle.com/competitions/playground-series-s5e5
htt

In [9]:
import time
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def setup_driver():
    """Create and return an undetected_chromedriver Chrome instance."""
    chrome_options = uc.ChromeOptions()

    # Headless mode is intentionally left off: Kaggle/Cloudflare is very
    # sensitive to it, so run with a visible window first. If headless is
    # required, enable it with:
    #   chrome_options.add_argument("--headless=new")

    # undetected_chromedriver installs and patches the driver binary itself,
    # so no webdriver_manager or Service object is involved here.
    return uc.Chrome(options=chrome_options)

def get_tab_content(driver, base_url, tab_name):
    """Load one competition tab and return its visible text without blank lines.

    Args:
        driver: WebDriver instance used for navigation.
        base_url: Competition URL without the tab suffix.
        tab_name: Tab label; it is lower-cased and appended to ``base_url``.

    Returns:
        The text of the ``<main>`` element (``<body>`` as a fallback) with
        whitespace-only lines removed. If extraction fails, the text still
        contains "Checking your browser", or it is shorter than 200
        characters, a string starting with ``"Error"`` is returned instead.
    """
    tab_url = f"{base_url}/{tab_name.lower()}"
    print(f"Crawling: {tab_url}...")
    driver.get(tab_url)

    try:
        wait = WebDriverWait(driver, 30) # Increased wait time for Cloudflare checks

        # 1. Wait for body to ensure page load
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        # 2. Check if we are stuck on the "Checking your browser" screen
        title = driver.title
        if "reCAPTCHA" in title or "Checking your browser" in title:
            print(f"  -> Detected Cloudflare challenge on {tab_name}. Waiting for redirect...")
            # Wait longer for the redirect to complete automatically
            time.sleep(10)

        # 3. Try to find the main content
        try:
            # Kaggle content is usually in <main> or a specific div structure
            main_element = wait.until(EC.presence_of_element_located((By.TAG_NAME, "main")))
        except Exception:
            # Catch Exception rather than a bare except so that Ctrl-C and
            # interpreter shutdown are not mistaken for a missing <main>.
            print(f"  -> Warning: <main> tag not found for {tab_name}, attempting fallback.")
            # Fallback: sometimes content is in #site-content or just body if structure varies
            main_element = driver.find_element(By.TAG_NAME, "body")

        # 4. Allow dynamic content to render
        time.sleep(5)

        content = main_element.text

        # Validation: If content is still the "Checking browser" text, we failed.
        if "Checking your browser" in content or len(content) < 200:
            return f"Error: Failed to bypass Cloudflare protection. Page content: {content[:100]}..."

        cleaned_content = "\n".join([line for line in content.split('\n') if line.strip()])
        return cleaned_content

    except Exception as e:
        return f"Error extracting content for {tab_name}: {str(e)}"

# Competition to crawl and the tabs to fetch from it.
base_url = "https://www.kaggle.com/competitions/playground-series-s5e12"
tabs = ["Overview", "Data", "Rules"]

driver = setup_driver()

results = {}

try:
    for tab in tabs:
        content = get_tab_content(driver, base_url, tab)
        results[tab] = content

        # get_tab_content reports failures as strings starting with "Error".
        if content.startswith("Error"):
            print(f"--- Failed to extract {tab} ---")
            print(content)
        else:
            print(f"--- Successfully extracted {len(content)} characters from {tab} ---")
        print("-" * 30)

finally:
    # Always release the browser, even if a tab raised.
    driver.quit()

# Displaying a snippet of the results
for tab, content in results.items():
    print(f"\n=== {tab.upper()} CONTENT (Snippet) ===")
    snippet = content[:500]
    if len(snippet) < len(content):
        # Only claim truncation when something was actually cut off.
        snippet += "...\n[Content Truncated]"
    print(snippet)



Crawling: https://www.kaggle.com/competitions/playground-series-s5e12/overview...
--- Successfully extracted 4189 characters from Overview ---
------------------------------
Crawling: https://www.kaggle.com/competitions/playground-series-s5e12/data...
--- Successfully extracted 1909 characters from Data ---
------------------------------
Crawling: https://www.kaggle.com/competitions/playground-series-s5e12/rules...
--- Successfully extracted 34000 characters from Rules ---
------------------------------

=== OVERVIEW CONTENT (Snippet) ===
menu
Create
explore
Home
emoji_events
Competitions
table_chart
Datasets
tenancy
Models
leaderboard
Benchmarks
smart_toy
Game Arena
code
Code
comment
Discussions
school
Learn
expand_more
More
auto_awesome_motion
View Active Events
search
Sign In
Register
Kaggle uses cookies from Google to deliver and enhance the quality of its services and to analyze traffic.
Learn more
OK, Got it.
KAGGLE · PLAYGROUND PREDICTION COMPETITION · 5 DAYS TO GO
Join Competit

In [15]:
import time
import os
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def setup_driver():
    """Create and return an undetected_chromedriver Chrome instance."""
    # No headless flag is added: running with a visible window is
    # recommended to pass Cloudflare checks.
    return uc.Chrome(options=uc.ChromeOptions())

def get_tab_content(driver, base_url, tab_name):
    """Load one competition tab and return its visible text without blank lines.

    Args:
        driver: WebDriver instance used for navigation.
        base_url: Competition URL without the tab suffix.
        tab_name: Tab label; it is lower-cased and appended to ``base_url``.

    Returns:
        The text of the ``<main>`` element (``<body>`` as a fallback) with
        whitespace-only lines removed. If extraction fails, the text still
        contains "Checking your browser", or it is shorter than 200
        characters, a string starting with ``"Error"`` is returned instead.
    """
    tab_url = f"{base_url}/{tab_name.lower()}"
    print(f"Crawling: {tab_url}...")
    driver.get(tab_url)

    try:
        wait = WebDriverWait(driver, 30)

        # 1. Wait for body to ensure page load
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        # 2. Check for Cloudflare/reCAPTCHA title
        if "reCAPTCHA" in driver.title or "Checking your browser" in driver.title:
            print(f"  -> Detected Cloudflare challenge on {tab_name}. Waiting for redirect...")
            time.sleep(10)

        # 3. Try to find the main content
        try:
            # Kaggle content is usually in <main>
            main_element = wait.until(EC.presence_of_element_located((By.TAG_NAME, "main")))
        except Exception:
            # Catch Exception rather than a bare except so that Ctrl-C and
            # interpreter shutdown are not mistaken for a missing <main>.
            print(f"  -> Warning: <main> tag not found for {tab_name}, attempting fallback.")
            main_element = driver.find_element(By.TAG_NAME, "body")

        # 4. Allow dynamic content to render
        time.sleep(5)

        content = main_element.text

        # Basic validation
        if "Checking your browser" in content or len(content) < 200:
            return "Error: Failed to bypass Cloudflare protection or content empty."

        # Clean up excessive newlines
        cleaned_content = "\n".join([line for line in content.split('\n') if line.strip()])
        return cleaned_content

    except Exception as e:
        return f"Error extracting content for {tab_name}: {str(e)}"

def get_filename_from_url(url):
    """Build a ``<slug>.txt`` filename from the last path segment of a URL.

    Trailing slashes, the query string and the fragment are ignored, so
    ``.../titanic/?tab=data`` yields ``titanic.txt``. If the URL has no path
    segment, the host name is used as the slug.

    Args:
        url: Page URL (or a bare path/slug).

    Returns:
        The filename string ``"<slug>.txt"``.
    """
    from urllib.parse import urlsplit

    parts = urlsplit(url)
    # Use only the path so '?' and '#' never end up in the filename.
    slug = parts.path.rstrip('/').split('/')[-1] or parts.netloc
    return f"{slug}.txt"

def main():
    """Crawl the Overview, Data and Rules tabs of one competition and save them.

    The tab texts are written to ``<slug>.txt`` in the working directory, each
    preceded by its tab name and separated by a ``-----`` line.
    """
    competition_url = "https://www.kaggle.com/competitions/playground-series-s5e12"
    tab_names = ["Overview", "Data", "Rules"]

    browser = setup_driver()
    scraped = {}

    try:
        for name in tab_names:
            scraped[name] = get_tab_content(browser, competition_url, name)
            print(f"--- Extracted {len(scraped[name])} characters from {name} ---")
            print("-" * 30)
    finally:
        # The browser is released whether or not every tab was fetched.
        browser.quit()

    # Save to file
    out_path = get_filename_from_url(competition_url)

    try:
        with open(out_path, "w", encoding="utf-8") as out:
            for position, name in enumerate(tab_names):
                # Separator goes between sections, never after the last one.
                if position > 0:
                    out.write("-----\n")
                out.write(f"{name}\n")
                out.write(f"{scraped.get(name, '')}\n")

        print(f"\nSuccessfully saved content to: {os.path.abspath(out_path)}")

    except OSError as err:
        print(f"Error writing to file: {err}")

if __name__ == "__main__":
    main()


Crawling: https://www.kaggle.com/competitions/playground-series-s5e12/overview...
--- Extracted 4189 characters from Overview ---
------------------------------
Crawling: https://www.kaggle.com/competitions/playground-series-s5e12/data...
--- Extracted 1909 characters from Data ---
------------------------------
Crawling: https://www.kaggle.com/competitions/playground-series-s5e12/rules...
--- Extracted 34000 characters from Rules ---
------------------------------

Successfully saved content to: /Users/minhtuan/Documents/Documents/Work/Hanoi/crawler/playground-series-s5e12.txt


In [6]:
import time
import os
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def setup_driver():
    """Create and return an undetected_chromedriver Chrome instance."""
    # Default options only: a visible window is recommended to pass
    # Cloudflare checks, so no headless flag is added.
    default_options = uc.ChromeOptions()
    return uc.Chrome(options=default_options)

def get_tab_content(driver, base_url, tab_name):
    """Load one competition tab and return its visible text without blank lines.

    Args:
        driver: WebDriver instance used for navigation.
        base_url: Competition URL without the tab suffix.
        tab_name: Tab label; it is lower-cased and appended to ``base_url``.

    Returns:
        The text of the ``<main>`` element (``<body>`` as a fallback) with
        whitespace-only lines removed, or a string starting with ``"Error"``
        if extraction raised.
    """
    tab_url = f"{base_url}/{tab_name.lower()}"
    print(f"Crawling: {tab_url}...")
    driver.get(tab_url)

    try:
        wait = WebDriverWait(driver, 30)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        # Check for Cloudflare
        if "reCAPTCHA" in driver.title or "Checking your browser" in driver.title:
            print(f"  -> Detected Cloudflare challenge on {tab_name}. Waiting...")
            time.sleep(10)

        try:
            main_element = wait.until(EC.presence_of_element_located((By.TAG_NAME, "main")))
        except Exception:
            # Catch Exception rather than a bare except so that Ctrl-C and
            # interpreter shutdown are not mistaken for a missing <main>.
            print(f"  -> Warning: <main> tag not found for {tab_name}, attempting fallback.")
            main_element = driver.find_element(By.TAG_NAME, "body")

        time.sleep(5) # Allow dynamic content to render

        content = main_element.text
        cleaned_content = "\n".join([line for line in content.split('\n') if line.strip()])
        return cleaned_content

    except Exception as e:
        return f"Error extracting content for {tab_name}: {str(e)}"

def get_code_content(driver, base_url, top_n=10):
    """Crawl the most-voted notebooks listed on a competition's Code tab.

    Opens ``<base_url>/code``, tries to switch the sort order from "Hotness"
    to "Most Votes" (a failure here only prints a warning), collects notebook
    links from the page, then visits the first ``top_n`` of them and extracts
    the text of each notebook's ``<main>`` element.

    Args:
        driver: WebDriver instance used for navigation.
        base_url: Competition URL without the tab suffix.
        top_n: Maximum number of notebooks to crawl.

    Returns:
        One string containing a titled section per notebook (or a one-line
        error entry for a notebook that failed). If processing the Code tab
        itself raises, a string starting with ``"Error in Code tab processing"``.
    """
    code_url = f"{base_url}/code"
    print(f"Crawling Code List: {code_url}...")
    driver.get(code_url)

    # Sections are gathered in a list and joined once at the end.
    sections = []

    try:
        wait = WebDriverWait(driver, 30)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(5) # Wait for initial load

        # --- 1. Handle Sorting (Hotness -> Most Votes) ---
        print("  -> Attempting to sort by 'Most Votes'...")
        try:
            # Find the dropdown trigger (defaults to "Hotness")
            # We use XPath to find the element containing the text "Hotness"
            hotness_btn = wait.until(EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'Hotness')]")))
            hotness_btn.click()
            time.sleep(1)

            # Find and click "Most Votes" in the dropdown menu
            most_votes_btn = wait.until(EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'Most Votes')]")))
            most_votes_btn.click()

            print("  -> Sort applied. Waiting for list to refresh...")
            time.sleep(5)
        except Exception as e:
            print(f"  -> Warning: Could not interact with Sort dropdown (UI might have changed). Error: {e}")

        # --- 2. Extract Notebook Links ---
        comments_suffix = "/comments"
        links_found = []
        seen = set()
        for elem in driver.find_elements(By.TAG_NAME, "a"):
            href = elem.get_attribute("href")
            # Filter for valid notebook links:
            # 1. Must contain '/code/'
            # 2. Must NOT be the main competition code tab (contains 'competitions')
            # 3. Must NOT be the 'New Notebook' link
            if not (href and "/code/" in href and "competitions" not in href and "/new" not in href):
                continue
            # A notebook's comments link points at the same notebook; fold it
            # into the notebook URL so it does not use up a top-N slot.
            if href.endswith(comments_suffix):
                href = href[:-len(comments_suffix)]
            if href not in seen:
                seen.add(href)
                links_found.append(href)

        top_links = links_found[:top_n]
        print(f"  -> Found {len(links_found)} notebooks. Processing top {len(top_links)}...")

        # --- 3. Crawl Each Notebook ---
        for position, link in enumerate(top_links, start=1):
            # Show the real total; fewer than top_n links may have been found.
            print(f"    [{position}/{len(top_links)}] Crawling notebook: {link}")
            try:
                driver.get(link)
                # Wait for the notebook content (main tag)
                wait.until(EC.presence_of_element_located((By.TAG_NAME, "main")))
                time.sleep(5) # Wait for cells/markdown to render

                title = driver.title.replace(" | Kaggle", "")
                page_text = driver.find_element(By.TAG_NAME, "main").text

                # Clean up text
                cleaned_text = "\n".join([line for line in page_text.split('\n') if line.strip()])

                sections.append(f"Notebook {position}: {title}\nURL: {link}\n")
                sections.append("-" * 20 + "\n")
                sections.append(cleaned_text + "\n")
                sections.append("=" * 40 + "\n\n")

            except Exception as e:
                print(f"    -> Error crawling notebook {link}: {e}")
                sections.append(f"Error crawling {link}: {e}\n\n")

    except Exception as e:
        return f"Error in Code tab processing: {str(e)}"

    return "".join(sections)

def get_filename_from_url(url):
    """Build a ``<slug>.txt`` filename from the last path segment of a URL.

    Trailing slashes, the query string and the fragment are ignored, so
    ``.../titanic/?tab=data`` yields ``titanic.txt``. If the URL has no path
    segment, the host name is used as the slug.

    Args:
        url: Page URL (or a bare path/slug).

    Returns:
        The filename string ``"<slug>.txt"``.
    """
    from urllib.parse import urlsplit

    parts = urlsplit(url)
    # Use only the path so '?' and '#' never end up in the filename.
    slug = parts.path.rstrip('/').split('/')[-1] or parts.netloc
    return f"{slug}.txt"

def main():
    """Crawl the standard tabs of one competition and save them to ``<slug>.txt``.

    Each tab is written as its name, its text and a ``-----`` separator line.
    A "Code" section is appended only when ``results`` holds a "Code" entry,
    i.e. when the Code-tab crawl below has been enabled.
    """
    base_url = "https://www.kaggle.com/competitions/titanic"
    # Standard tabs
    tabs = ["Overview", "Data", "Rules"]

    driver = setup_driver()
    results = {}

    try:
        # 1. Crawl Standard Tabs
        for tab in tabs:
            content = get_tab_content(driver, base_url, tab)
            results[tab] = content
            print(f"--- Extracted {len(content)} characters from {tab} ---")
            print("-" * 30)

        # # 2. Crawl Code Tab (Special Logic)
        # code_content = get_code_content(driver, base_url)
        # results["Code"] = code_content
        # print(f"--- Extracted {len(code_content)} characters from Code (Top 10) ---")

    finally:
        driver.quit()

    # Save to file
    filename = get_filename_from_url(base_url)

    try:
        with open(filename, "w", encoding="utf-8") as f:
            # Write Standard Tabs
            for tab in tabs:
                content = results.get(tab, "")
                f.write(f"{tab}\n")
                f.write(f"{content}\n")
                f.write("-----\n")

            # Write Code Tab only if it was crawled, so the file does not
            # advertise an empty section.
            if "Code" in results:
                code_text = results["Code"]
                f.write("Code (Top 10 Most Votes)\n")
                f.write(code_text)
                # Error strings have no trailing newline; keep the separator
                # on a line of its own.
                if not code_text.endswith("\n"):
                    f.write("\n")
                f.write("-----\n")

        print(f"\nSuccessfully saved content to: {os.path.abspath(filename)}")

    except IOError as e:
        print(f"Error writing to file: {e}")

if __name__ == "__main__":
    main()


Crawling: https://www.kaggle.com/competitions/titanic/overview...
--- Extracted 10110 characters from Overview ---
------------------------------
Crawling: https://www.kaggle.com/competitions/titanic/data...
--- Extracted 3143 characters from Data ---
------------------------------
Crawling: https://www.kaggle.com/competitions/titanic/rules...
--- Extracted 24674 characters from Rules ---
------------------------------

Successfully saved content to: /Users/minhtuan/Documents/Documents/Work/Hanoi/crawler/titanic.txt


In [None]:
import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_notebook_content(url):
    """
    Crawl a Kaggle notebook page and return its text.

    A fresh Chrome driver is started for the call and quit before returning.
    The result combines the text of the outer page ``<body>`` with the text
    of the ``rendered-kernel-content`` iframe, which is scrolled to the
    bottom (at most 20 times) before its text is read.

    Args:
        url: Notebook page URL.

    Returns:
        ``"=== MAIN PAGE ===..."`` followed by ``"=== NOTEBOOK CONTENT ===..."``
        on success. A string starting with ``"Error"`` if the driver cannot be
        initialised, extraction raises, or the combined text is shorter than
        500 characters. If only the iframe step fails, a warning is printed
        and the notebook section is left empty.
    """
    options = Options()
    # Add arguments to improve stability and prevent crashes
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument('--window-size=1920,1080')
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    # Attempt to initialize the driver
    try:
        # Using standard Selenium with webdriver_manager to avoid undetected_chromedriver crashes
        driver_path = ChromeDriverManager().install()
        # If the returned path is the THIRD_PARTY_NOTICES file, point at the
        # "chromedriver" binary in the same directory instead.
        if "THIRD_PARTY_NOTICES" in driver_path:
            driver_dir = os.path.dirname(driver_path)
            driver_path = os.path.join(driver_dir, "chromedriver")

        if os.path.exists(driver_path):
            os.chmod(driver_path, 0o755)

        service = Service(driver_path)
        driver = webdriver.Chrome(service=service, options=options)
    except Exception as e:
        return f"Error initializing driver: {str(e)}"

    try:
        print(f"Crawling: {url}")
        driver.set_page_load_timeout(60)
        driver.get(url)

        # Simple wait as requested
        print("  -> Waiting for content to load...")
        time.sleep(5)

        # 1. Get Main Page Content (Title, Votes, etc.)
        print("  -> Extracting main page context...")
        main_content = driver.find_element(By.TAG_NAME, "body").text

        # 2. Switch to Notebook Iframe
        print("  -> Looking for notebook iframe...")
        iframe_content = ""
        try:
            # Wait for iframe to be present
            iframe = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "rendered-kernel-content"))
            )
            print("  -> Found iframe. Switching context...")
            driver.switch_to.frame(iframe)
            time.sleep(5) # Wait for iframe content to render

            # Scroll inside iframe until the height stops growing
            print("  -> Scrolling inside iframe...")
            last_height = driver.execute_script("return document.body.scrollHeight")
            for _ in range(20):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(1)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            # Extract all text from iframe
            print("  -> Extracting notebook content...")
            iframe_content = driver.find_element(By.TAG_NAME, "body").text

            # Switch back
            driver.switch_to.default_content()

        except Exception as e:
            print(f"  -> Warning: Could not extract iframe content: {e}")

        # Combine
        content = f"=== MAIN PAGE ===\n{main_content}\n\n=== NOTEBOOK CONTENT ===\n{iframe_content}"

        # Basic validation
        if not content or len(content) < 500:
            return "Error: Extracted content seems short. The notebook might not have loaded completely."

        return content

    except Exception as e:
        return f"Error extracting notebook: {str(e)}"
    finally:
        # Ensure driver is closed properly. Shutdown stays best-effort, but
        # the failure is reported and Ctrl-C is no longer swallowed.
        try:
            driver.quit()
        except Exception as e:
            print(f"  -> Warning: driver.quit() failed: {e}")

if __name__ == "__main__":
    target_url =  "https://www.kaggle.com/code/masayakawamata/s5e12-eda-xgb-competition-starter"

    notebook_text = get_notebook_content(target_url)

    # Derive the output name from the notebook slug so the two cannot drift apart.
    filename = f"{target_url.rstrip('/').split('/')[-1]}.txt"

    # get_notebook_content reports every failure as a string starting with
    # "Error"; a successful result starts with "=== MAIN PAGE ===".
    if notebook_text.startswith("Error"):
        print(f"\nFailed to crawl notebook, nothing saved: {notebook_text}")
    else:
        with open(filename, "w", encoding="utf-8") as f:
            f.write(notebook_text)
        print(f"\nSuccessfully saved content to: {filename}")

Crawling: https://www.kaggle.com/competitions/titanic/data
  -> Waiting for content to load...
  -> Extracting main page context...
  -> Looking for notebook iframe...
Stacktrace:
0   chromedriver                        0x00000001050c3dfc cxxbridge1$str$ptr + 3031016
1   chromedriver                        0x00000001050bbcb8 cxxbridge1$str$ptr + 2997924
2   chromedriver                        0x0000000104bb6b90 _RNvCsgXDX2mvAJAg_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 74192
3   chromedriver                        0x0000000104bfdab4 _RNvCsgXDX2mvAJAg_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 364788
4   chromedriver                        0x0000000104c3ea28 _RNvCsgXDX2mvAJAg_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 630888
5   chromedriver                        0x0000000104bf222c _RNvCsgXDX2mvAJAg_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 317548
6   chromedriver                        0x0000000105088194 cxxbridge1$str$ptr + 2786176
7   chromedriver     

In [3]:
import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# --- CONFIGURATION ---
# Competition homepages to crawl. Each entry is a base competition URL with no
# tab suffix; main() appends the lower-cased tab name and "/code" to it, and
# uses the last path segment as the output directory name.
URLS_TO_CRAWL = [
    "https://www.kaggle.com/competitions/titanic"
]

# Alternative: crawl a pre-built list named `links` (presumably the list
# produced by the competition-listing cell - requires that cell to have run).
# URLS_TO_CRAWL = links 

# --- DRIVER SETUP ---
def setup_driver():
    """Start Chrome through Selenium and webdriver_manager.

    Returns:
        The Chrome WebDriver, or ``None`` (after printing the reason) when
        the driver could not be installed or started.
    """
    chrome_flags = (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--window-size=1920,1080',
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    )
    options = Options()
    for flag in chrome_flags:
        options.add_argument(flag)

    # A persistent profile directory under the working directory is reused
    # across runs to avoid CAPTCHA on subsequent runs.
    profile_dir = os.path.join(os.getcwd(), "selenium_profile")
    options.add_argument(f"--user-data-dir={profile_dir}")

    try:
        binary_path = ChromeDriverManager().install()
        # If the returned path is the THIRD_PARTY_NOTICES file, use the
        # "chromedriver" binary that sits next to it.
        if "THIRD_PARTY_NOTICES" in binary_path:
            binary_path = os.path.join(os.path.dirname(binary_path), "chromedriver")
        # Make sure the binary is executable before launching it.
        if os.path.exists(binary_path):
            os.chmod(binary_path, 0o755)
        return webdriver.Chrome(service=Service(binary_path), options=options)
    except Exception as err:
        print(f"Fatal Error: Could not initialize driver: {err}")
        return None

# --- CRAWLING FUNCTIONS ---
def get_tab_content(driver, url):
    """Open a tab URL and return the text of its ``#site-content`` element.

    Returns an ``"Error extracting content from <url>."`` string (after
    printing a warning) when the element cannot be found or read.
    """
    print(f"  Crawling tab: {url}...")
    driver.get(url)

    content_locator = (By.ID, "site-content")
    try:
        # Block until the content container exists, then give it a moment to fill in.
        WebDriverWait(driver, 20).until(EC.presence_of_element_located(content_locator))
        time.sleep(3)
        # Look the element up again after the pause and read its text.
        container = driver.find_element(*content_locator)
        return container.text
    except Exception as err:
        print(f"    -> Warning: Could not extract content from {url}: {err}")
        return f"Error extracting content from {url}."

def get_top_notebook_links(driver, base_url, top_n=10):
    """Return the first notebook links from a competition's Code tab.

    Opens ``<base_url>/code``, switches the sort order from "Hotness" to
    "Most Votes", then collects notebook links in page order.

    Args:
        driver: WebDriver instance used for navigation.
        base_url: Competition URL without the tab suffix.
        top_n: Maximum number of links to return.

    Returns:
        Up to ``top_n`` distinct notebook URLs, or an empty list if any step
        (including the sort) raises.
    """
    code_url = f"{base_url}/code"
    print(f"  Getting notebook links from: {code_url}...")
    driver.get(code_url)

    comments_suffix = "/comments"
    links = []
    seen = set()
    try:
        wait = WebDriverWait(driver, 30)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(5)

        # Sort by 'Most Votes'
        print("    -> Sorting by 'Most Votes'...")
        wait.until(EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'Hotness')]"))).click()
        time.sleep(1)
        wait.until(EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'Most Votes')]"))).click()
        time.sleep(5)

        # Extract links
        for elem in driver.find_elements(By.TAG_NAME, "a"):
            href = elem.get_attribute("href")
            if not (href and "/code/" in href and "competitions" not in href and "/new" not in href):
                continue
            # Strip only a trailing "/comments". str.replace would also remove
            # "/comments" from the middle of the URL and corrupt notebook
            # slugs that start with "comments".
            if href.endswith(comments_suffix):
                href = href[:-len(comments_suffix)]
            if href not in seen:
                seen.add(href)
                links.append(href)

        print(f"    -> Found {len(links)} links, taking top {top_n}.")
        return links[:top_n]
    except Exception as e:
        print(f"    -> Error getting notebook links: {e}")
        return []

def get_single_notebook_content(driver, url):
    """Crawl one notebook page and return the text of its rendered iframe.

    The outer page text is read only for the length check; the returned value
    is the text of the ``rendered-kernel-content`` iframe.

    Args:
        driver: WebDriver instance used for navigation.
        url: Notebook page URL.

    Returns:
        The iframe text. An ``"Error extracting notebook content from iframe"``
        string if the iframe step raises, or an ``"Error: Extracted content
        seems short"`` string if the combined text is shorter than 500
        characters or the iframe text is empty.
    """
    print(f"    Crawling notebook: {url}")
    driver.set_page_load_timeout(60)
    driver.get(url)

    # Simple wait as requested
    print("  -> Waiting for content to load...")
    time.sleep(5)

    # 1. Get Main Page Content (Title, Votes, etc.)
    print("  -> Extracting main page context...")
    main_content = driver.find_element(By.TAG_NAME, "body").text

    # 2. Switch to Notebook Iframe
    print("  -> Looking for notebook iframe...")
    iframe_content = ""
    try:
        # Wait for iframe to be present
        iframe = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.ID, "rendered-kernel-content"))
        )
        print("  -> Found iframe. Switching context...")
        driver.switch_to.frame(iframe)
        time.sleep(5) # Wait for iframe content to render

        # Scroll inside iframe until the height stops growing
        print("  -> Scrolling inside iframe...")
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(20):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Extract all text from iframe
        print("  -> Extracting notebook content...")
        iframe_content = driver.find_element(By.TAG_NAME, "body").text

    except Exception as e:
        iframe_content = f"Error extracting notebook content from iframe: {e}"
    finally:
        # Always switch back to the main page context; the driver is shared
        # with later navigations.
        driver.switch_to.default_content()

    # Combine
    content = f"=== MAIN PAGE ===\n{main_content}\n\n=== NOTEBOOK CONTENT ===\n{iframe_content}"

    # Basic validation. The combined length alone cannot catch an empty
    # notebook body, so the returned text is checked as well.
    # NOTE(review): only the iframe text is returned, not `content` - confirm
    # that dropping the main-page text is intended.
    if len(content) < 500 or not iframe_content.strip():
        return "Error: Extracted content seems short. The notebook might not have loaded completely."

    return iframe_content

# --- HELPER FUNCTIONS ---
def slugify(url, option):
    """Return one '/'-separated segment of ``url`` as a file or directory name.

    A single trailing slash is dropped first; ``option`` is the index of the
    segment to pick (e.g. ``-1`` for the last one). Any '?' or '=' in the
    chosen segment is replaced with '-'.
    """
    trimmed = url[:-1] if url.endswith('/') else url
    segment = trimmed.split('/')[option]
    for unsafe_char in ('?', '='):
        segment = segment.replace(unsafe_char, '-')
    return segment

# --- MAIN WORKFLOW ---
def main():
    """Crawl every competition in ``URLS_TO_CRAWL`` together with its top notebooks.

    For each competition a directory named after its slug is created. The
    Overview, Data and Rules tab texts plus the notebook link list go into
    ``<slug>/<slug>.txt``; each linked notebook is saved under
    ``<slug>/notebooks/``. Returns early if no driver could be started.
    """
    driver = setup_driver()
    if not driver:
        return

    try:
        for competition_url in URLS_TO_CRAWL:
            print(f"\n--- Starting Competition: {competition_url} ---")

            # 1. Output directories
            comp_slug = slugify(competition_url, -1)
            notebooks_dir = os.path.join(comp_slug, "notebooks")
            os.makedirs(comp_slug, exist_ok=True)
            os.makedirs(notebooks_dir, exist_ok=True)

            # 2. Main competition tabs, fetched in this order
            sections = {
                tab_name: get_tab_content(driver, f"{competition_url}/{tab_name.lower()}")
                for tab_name in ("Overview", "Data", "Rules")
            }

            # 3. Notebook links from the "Code" tab
            notebook_links = get_top_notebook_links(driver, competition_url)
            sections["Code"] = "Top 10 Notebook Links:\n" + "\n".join(notebook_links)

            # 4. One file with every section, in insertion order
            main_filename = os.path.join(comp_slug, f"{comp_slug}.txt")
            with open(main_filename, "w", encoding="utf-8") as out:
                for heading, body in sections.items():
                    out.write(f"=== {heading.upper()} ===\n")
                    out.write(body + "\n\n")
            print(f"\nSaved main competition data to: {main_filename}")

            # 5. One file per notebook
            print(f"\n--- Crawling {len(notebook_links)} notebooks for {comp_slug} ---")
            for notebook_url in notebook_links:
                target_path = os.path.join(notebooks_dir, f"{slugify(notebook_url, -1)}.txt")
                notebook_text = get_single_notebook_content(driver, notebook_url)
                with open(target_path, "w", encoding="utf-8") as out:
                    out.write(notebook_text)
                print(f"      -> Saved to {target_path}")

    finally:
        print("\nWorkflow finished. Closing driver.")
        driver.quit()

if __name__ == "__main__":
    main()



--- Starting Competition: https://www.kaggle.com/competitions/titanic ---
  Crawling tab: https://www.kaggle.com/competitions/titanic/overview...
Stacktrace:
0   chromedriver                        0x000000010104bdfc cxxbridge1$str$ptr + 3031016
1   chromedriver                        0x0000000101043cb8 cxxbridge1$str$ptr + 2997924
2   chromedriver                        0x0000000100b3eb90 _RNvCsgXDX2mvAJAg_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 74192
3   chromedriver                        0x0000000100b85ab4 _RNvCsgXDX2mvAJAg_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 364788
4   chromedriver                        0x0000000100bc6a28 _RNvCsgXDX2mvAJAg_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 630888
5   chromedriver                        0x0000000100b7a22c _RNvCsgXDX2mvAJAg_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 317548
6   chromedriver                        0x0000000101010194 cxxbridge1$str$ptr + 2786176
7   chromedriver                        0x