In [3]:
import asyncio
import re
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError, Error as PlaywrightError

async def scrape_xueqiu_playwright_async(url):
    """
    Scrape the main post and comments from a Xueqiu article using Playwright's async API.

    Steps, in order: launch a headed Chromium, open ``url``, try to dismiss
    two pop-ups, read the article body, click the comments tab, scroll to
    the bottom and collect the comment paragraphs. Every step is
    best-effort: a failure is printed (most steps also save a
    ``playwright_debug_*.png`` screenshot) and the remaining steps still run.

    Args:
        url (str): The URL of the Xueqiu post.

    Returns:
        dict: ``{"main_post": <str or None>, "comments": <list of str>}``.
        Whatever was collected before a failure is still returned;
        exceptions are reported on stdout instead of being propagated.
    """
    scraped_data = {"main_post": None, "comments": []}

    async def save_debug_screenshot(page, path):
        """Save a diagnostic screenshot; report, but never propagate, a failure."""
        try:
            await page.screenshot(path=path)
        except PlaywrightError as shot_err:
            print(f"Could not save debug screenshot '{path}': {shot_err}")

    print("--- Starting Playwright Async Scraper ---")
    try:
        async with async_playwright() as p:
            print("Launching Chromium browser (async)...")
            browser = await p.chromium.launch(headless=False, args=["--start-maximized"])
            context = None
            try:
                context = await browser.new_context(
                    no_viewport=True,
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
                    locale="zh-CN",
                )
                page = await context.new_page()

                print(f"Navigating to: {url} (async)")
                await page.goto(url, timeout=60000, wait_until='domcontentloaded')
                print("Page loaded (DOM content). Waiting for potential dynamic elements...")
                await page.wait_for_timeout(2000)

                # --- Handle pop-ups ---
                popup_timeout = 10000

                # 1. First pop-up ("跳过" = "skip")
                print("Looking for the first pop-up ('跳过')...")
                skip_locator_css = 'span:text-is("跳过"), button:has-text("跳过")'
                try:
                    skip_button = page.locator(skip_locator_css).first
                    await skip_button.wait_for(state='visible', timeout=popup_timeout)
                    print("First pop-up '跳过' button found. Clicking...")
                    await skip_button.click(timeout=5000)
                    print("Clicked '跳过'.")
                    await page.wait_for_timeout(1000)
                except PlaywrightTimeoutError:
                    print("First pop-up ('跳过') did not appear or timed out.")
                except PlaywrightError as err:
                    print(f"Error interacting with first pop-up: {err}")

                # 2. Second pop-up (close "X"); the candidates are tried in order.
                print("Looking for the second pop-up ('X')...")
                close_selectors_css = [
                    "div.modal-wrapper i.icon-close", "div.xq-dialog-wrapper i.close",
                    "i.cube-dialog-close", "div.Modal_modal i.Modal_closeIcon",
                    "div[aria-label='Close']", "button[aria-label='Close']",
                ]
                # Split the pop-up budget evenly so the whole search is bounded
                # by popup_timeout no matter how many selectors are listed.
                per_selector_timeout = popup_timeout / len(close_selectors_css)
                close_button_found = False
                for selector in close_selectors_css:
                    try:
                        print(f"Trying close selector: {selector}")
                        close_button = page.locator(selector).first
                        await close_button.wait_for(state='visible', timeout=per_selector_timeout)
                        print(f"Second pop-up 'X' button found with selector: {selector}. Clicking...")
                        await close_button.click(timeout=5000)
                        print("Clicked 'X' on the second pop-up.")
                        close_button_found = True
                        await page.wait_for_timeout(1000)
                        break
                    except PlaywrightTimeoutError:
                        continue
                    except PlaywrightError as err:
                        print(f"Error interacting with second pop-up using selector '{selector}': {err}")
                if not close_button_found:
                    print("Second pop-up (close 'X') not found with attempted selectors.")

                # --- Scrape main post content ---
                print("Scraping main post content...")
                post_locator_css = "div.article__content, div.article__bd__detail"
                try:
                    post_element = page.locator(post_locator_css).first
                    await post_element.wait_for(state="visible", timeout=20000)
                    scraped_data["main_post"] = await post_element.text_content()
                    print("Main post content scraped successfully.")
                except PlaywrightTimeoutError:
                    print("Main post content not found or not visible.")
                    await save_debug_screenshot(page, "playwright_debug_no_main_post.png")
                except PlaywrightError as err:
                    print(f"Error scraping main post: {err}")
                    await save_debug_screenshot(page, "playwright_debug_error_main_post.png")

                # --- Click the comments tab ("评论" = "comments") ---
                comments_tab_clicked = False
                print("Looking for the '评论' (Comments) tab...")
                tab_locator_css = "div.tabs__item:has(span:text-is('评论')), div.action-bar__item:has-text('评论')"
                try:
                    comments_tab = page.locator(tab_locator_css).first
                    await comments_tab.wait_for(state="visible", timeout=20000)
                    print("Comments tab found. Clicking...")
                    await comments_tab.click(timeout=10000)
                    print("Clicked '评论' tab.")
                    comments_tab_clicked = True
                    await page.wait_for_timeout(2500)
                except PlaywrightTimeoutError:
                    print("Comments tab ('评论') not found or not visible/clickable.")
                    await save_debug_screenshot(page, "playwright_debug_no_comment_tab.png")
                except PlaywrightError as err:
                    print(f"Error clicking comments tab: {err}")
                    await save_debug_screenshot(page, "playwright_debug_error_comment_tab.png")

                # --- Scrape comments ---
                if comments_tab_clicked:
                    print("Attempting to scrape comments...")
                    print("Scrolling down page...")
                    # Two-step scroll (80%, then bottom) with pauses in between;
                    # presumably gives lazily loaded comments time to render.
                    await page.evaluate("window.scrollTo(0, document.body.scrollHeight*0.8)")
                    await page.wait_for_timeout(1000)
                    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                    await page.wait_for_timeout(2000)

                    comment_locator_css = "div.comment__item__main > p"
                    print(f"Waiting for the first comment element ('{comment_locator_css}') to be visible...")
                    try:
                        await page.locator(comment_locator_css).first.wait_for(state="visible", timeout=25000)
                        print("First comment element visible.")
                        print("Extracting all comment texts...")
                        all_texts = await page.locator(comment_locator_css).all_text_contents()

                        if all_texts:
                            # Keep non-empty paragraphs, dropping those with "回复@"
                            # in the first five characters or containing the
                            # "查看回复" / "查看对话" markers.
                            kept = [
                                text
                                for text in (raw.strip() for raw in all_texts)
                                if text
                                and "回复@" not in text[:5]
                                and "查看回复" not in text
                                and "查看对话" not in text
                            ]
                            scraped_data["comments"].extend(kept)
                            print(f"Successfully scraped {len(kept)} non-empty comments.")
                        else:
                            print("Located comment elements, but failed to extract text.")
                            await save_debug_screenshot(page, "playwright_debug_comments_found_but_empty.png")
                    except PlaywrightTimeoutError:
                        print(f"Timed out waiting for the first comment element ('{comment_locator_css}') to become visible.")
                        await save_debug_screenshot(page, "playwright_debug_timeout_waiting_for_comments.png")
                    except PlaywrightError as err:
                        print(f"Error scraping comments: {err}")
                        await save_debug_screenshot(page, "playwright_debug_error_scraping_comments.png")
                else:
                    print("Skipping comment scraping because '评论' tab was not successfully clicked.")

                print("\n--- Taking final screenshot ---")
                await page.screenshot(path="playwright_final_state.png")
                print("Saved final screenshot: playwright_final_state.png")
            finally:
                # Clean up while still inside `async with`: once that block
                # exits, the Playwright driver is stopped and the browser can
                # no longer be closed through it.
                print("Closing browser context and browser (async)...")
                for closable in (context, browser):
                    if closable is None:
                        continue
                    try:
                        await closable.close()
                    except PlaywrightError as close_err:
                        print(f"Error during browser cleanup: {close_err}")

    except Exception as e:
        # Top-level boundary: report the failure and fall through so the
        # caller still receives whatever was scraped.
        print("\n--- An critical error occurred during the Playwright Async process ---")
        print(f"Error Type: {type(e).__name__}")
        print(f"Error Details: {e}")

    print("--- Playwright Async Scraper Finished ---")
    return scraped_data

# --- How to run in a Jupyter Notebook ---
async def main():
    """Scrape one hard-coded Xueqiu post and print a truncated summary of the result."""
    target_url = "https://xueqiu.com/5669998349/334081638"
    data = await scrape_xueqiu_playwright_async(target_url)

    divider = "=" * 30
    print("\n" + divider)
    print("      Scraped Data Summary (Playwright Async)")
    print(divider)

    # Main post: show at most 500 characters, marking truncation with "...".
    print("\n--- Main Post ---")
    post_text = data["main_post"]
    if post_text:
        tail = '...' if len(post_text) > 500 else ''
        print(post_text[:500] + tail)
    else:
        print(">>> Main post content not found or scraping failed. <<<")

    # Comments: numbered from 1, each cut to 150 characters.
    print("\n--- Comments ---")
    comments = data["comments"]
    if comments:
        print(f"Found {len(comments)} comments:")
        for number, body in enumerate(comments, start=1):
            tail = '...' if len(body) > 150 else ''
            print(f"{number}. {body[:150]}" + tail)
    else:
        print(">>> No comments found or scraping failed. <<<")

    print("\n" + divider)
    print("Check console logs and playwright_debug_*.png files for details.")

# To run this in a Jupyter Notebook cell, execute:
#     await main()
#
# In a regular Python script (or any environment without top-level `await`),
# start the event loop explicitly instead:
#     if __name__ == "__main__":
#         asyncio.run(main())

In [4]:
# Top-level await: valid in a notebook/IPython cell, not in a plain script.
await main()

--- Starting Playwright Async Scraper ---

--- An critical error occurred during the Playwright Async process ---
Error Type: NotImplementedError
Error Details: 
--- Playwright Async Scraper Finished ---

      Scraped Data Summary (Playwright Async)

--- Main Post ---
>>> Main post content not found or scraping failed. <<<

--- Comments ---
>>> No comments found or scraping failed. <<<

Check console logs and playwright_debug_*.png files for details.


In [5]:
import asyncio
import re
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError, Error as PlaywrightError
import nest_asyncio # <--- Import nest_asyncio

# Patch asyncio so an already-running event loop (such as the notebook's) can
# be re-entered. Apply once, at the top of the notebook or script.
nest_asyncio.apply()

async def scrape_xueqiu_playwright_async(url):
    """
    Scrape the main post and comments from a Xueqiu article using Playwright's async API.

    Steps, in order: launch a headed Chromium (default window size), open
    ``url``, try to dismiss two pop-ups, read the article body, click the
    comments tab, scroll to the bottom and collect the comment paragraphs.
    Clicks are issued with ``force=True``. Every step is best-effort: a
    failure is printed (most steps also save a ``playwright_debug_*.png``
    screenshot) and the remaining steps still run.

    Args:
        url (str): The URL of the Xueqiu post.

    Returns:
        dict: ``{"main_post": <str or None>, "comments": <list of str>}``.
        Whatever was collected before a failure is still returned;
        exceptions are reported on stdout instead of being propagated.
    """
    scraped_data = {"main_post": None, "comments": []}

    async def save_debug_screenshot(page, path):
        """Save a diagnostic screenshot; report, but never propagate, a failure."""
        try:
            await page.screenshot(path=path)
        except PlaywrightError as shot_err:
            print(f"Could not save debug screenshot '{path}': {shot_err}")

    print("--- Starting Playwright Async Scraper (with nest_asyncio) ---")
    try:
        async with async_playwright() as p:
            print("Launching Chromium browser (async)...")
            # Launched without "--start-maximized" to rule that flag out as a
            # cause of the startup failure.
            browser = await p.chromium.launch(headless=False)
            context = None
            try:
                context = await browser.new_context(
                    # no_viewport lets the page follow the real window size; a
                    # fixed viewport (e.g. 1280x720) is the alternative.
                    no_viewport=True,
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
                    locale="zh-CN",
                )
                page = await context.new_page()

                print(f"Navigating to: {url} (async)")
                await page.goto(url, timeout=60000, wait_until='domcontentloaded')
                print("Page loaded (DOM content). Waiting for potential dynamic elements...")
                await page.wait_for_timeout(3000)

                # --- Handle pop-ups ---
                popup_timeout = 10000

                # 1. First pop-up ("跳过" = "skip")
                print("Looking for the first pop-up ('跳过')...")
                skip_locator_css = 'span:text-is("跳过"), button:has-text("跳过")'
                try:
                    skip_button = page.locator(skip_locator_css).first
                    await skip_button.wait_for(state='visible', timeout=popup_timeout)
                    print("First pop-up '跳过' button found. Clicking...")
                    await skip_button.click(timeout=5000, force=True)
                    print("Clicked '跳过'.")
                    await page.wait_for_timeout(1500)
                except PlaywrightTimeoutError:
                    print("First pop-up ('跳过') did not appear or timed out.")
                except PlaywrightError as err:
                    print(f"Error interacting with first pop-up: {err}")

                # 2. Second pop-up (close "X"); the candidates are tried in order.
                print("Looking for the second pop-up ('X')...")
                close_selectors_css = [
                    "div.modal-wrapper i.icon-close", "div.xq-dialog-wrapper i.close",
                    "i.cube-dialog-close", "div.Modal_modal i.Modal_closeIcon",
                    "div[aria-label='Close']", "button[aria-label='Close']",
                ]
                # Split the pop-up budget evenly so the whole search is bounded
                # by popup_timeout no matter how many selectors are listed.
                per_selector_timeout = popup_timeout / len(close_selectors_css)
                close_button_found = False
                for selector in close_selectors_css:
                    try:
                        print(f"Trying close selector: {selector}")
                        close_button = page.locator(selector).first
                        await close_button.wait_for(state='visible', timeout=per_selector_timeout)
                        print(f"Second pop-up 'X' button found with selector: {selector}. Clicking...")
                        await close_button.click(timeout=5000, force=True)
                        print("Clicked 'X' on the second pop-up.")
                        close_button_found = True
                        await page.wait_for_timeout(1500)
                        break
                    except PlaywrightTimeoutError:
                        continue
                    except PlaywrightError as err:
                        print(f"Error interacting with second pop-up using selector '{selector}': {err}")
                if not close_button_found:
                    print("Second pop-up (close 'X') not found with attempted selectors.")

                # --- Scrape main post content ---
                print("Scraping main post content...")
                post_locator_css = "div.article__content, div.article__bd__detail"
                try:
                    post_element = page.locator(post_locator_css).first
                    await post_element.wait_for(state="visible", timeout=20000)
                    scraped_data["main_post"] = await post_element.text_content()
                    print("Main post content scraped successfully.")
                except PlaywrightTimeoutError:
                    print("Main post content not found or not visible.")
                    await save_debug_screenshot(page, "playwright_debug_no_main_post.png")
                except PlaywrightError as err:
                    print(f"Error scraping main post: {err}")
                    await save_debug_screenshot(page, "playwright_debug_error_main_post.png")

                # --- Click the comments tab ("评论" = "comments") ---
                comments_tab_clicked = False
                print("Looking for the '评论' (Comments) tab...")
                tab_locator_css = "div.tabs__item:has(span:text-is('评论')), div.action-bar__item:has-text('评论')"
                try:
                    comments_tab = page.locator(tab_locator_css).first
                    await comments_tab.wait_for(state="visible", timeout=25000)
                    print("Comments tab found. Clicking...")
                    await comments_tab.click(timeout=10000, force=True)
                    print("Clicked '评论' tab.")
                    comments_tab_clicked = True
                    await page.wait_for_timeout(3000)
                except PlaywrightTimeoutError:
                    print("Comments tab ('评论') not found or not visible/clickable.")
                    await save_debug_screenshot(page, "playwright_debug_no_comment_tab.png")
                except PlaywrightError as err:
                    print(f"Error clicking comments tab: {err}")
                    await save_debug_screenshot(page, "playwright_debug_error_comment_tab.png")

                # --- Scrape comments ---
                if comments_tab_clicked:
                    print("Attempting to scrape comments...")
                    print("Scrolling down page...")
                    # Two-step scroll (80%, then bottom) with pauses in between;
                    # presumably gives lazily loaded comments time to render.
                    await page.evaluate("window.scrollTo(0, document.body.scrollHeight*0.8)")
                    await page.wait_for_timeout(1500)
                    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                    await page.wait_for_timeout(2500)

                    comment_locator_css = "div.comment__item__main > p"
                    print(f"Waiting for the first comment element ('{comment_locator_css}') to be visible...")
                    try:
                        await page.locator(comment_locator_css).first.wait_for(state="visible", timeout=30000)
                        print("First comment element visible.")
                        print("Extracting all comment texts...")
                        all_texts = await page.locator(comment_locator_css).all_text_contents()

                        if all_texts:
                            # Keep non-empty paragraphs, dropping those with "回复@"
                            # in the first five characters or containing the
                            # "查看回复" / "查看对话" markers.
                            kept = [
                                text
                                for text in (raw.strip() for raw in all_texts)
                                if text
                                and "回复@" not in text[:5]
                                and "查看回复" not in text
                                and "查看对话" not in text
                            ]
                            scraped_data["comments"].extend(kept)
                            print(f"Successfully scraped {len(kept)} non-empty comments.")
                        else:
                            print("Located comment elements, but failed to extract text.")
                            await save_debug_screenshot(page, "playwright_debug_comments_found_but_empty.png")
                    except PlaywrightTimeoutError:
                        print(f"Timed out waiting for the first comment element ('{comment_locator_css}') to become visible.")
                        await save_debug_screenshot(page, "playwright_debug_timeout_waiting_for_comments.png")
                    except PlaywrightError as err:
                        print(f"Error scraping comments: {err}")
                        await save_debug_screenshot(page, "playwright_debug_error_scraping_comments.png")
                else:
                    print("Skipping comment scraping because '评论' tab was not successfully clicked.")

                print("\n--- Taking final screenshot ---")
                await page.screenshot(path="playwright_final_state.png")
                print("Saved final screenshot: playwright_final_state.png")
            finally:
                # Clean up while still inside `async with`: once that block
                # exits, the Playwright driver is stopped and the browser can
                # no longer be closed through it.
                print("Closing browser context and browser (async)...")
                for closable in (context, browser):
                    if closable is None:
                        continue
                    try:
                        await closable.close()
                    except PlaywrightError as close_err:
                        print(f"Error during browser cleanup: {close_err}")

    except Exception as e:
        # Top-level boundary: report the failure and fall through so the
        # caller still receives whatever was scraped.
        print("\n--- An critical error occurred during the Playwright Async process ---")
        print(f"Error Type: {type(e).__name__}")
        print(f"Error Details: {e}")

    print("--- Playwright Async Scraper Finished ---")
    return scraped_data

# --- How to run in a Jupyter Notebook ---
async def main():
    """Scrape one hard-coded Xueqiu post and print a truncated summary of the result."""
    target_url = "https://xueqiu.com/5669998349/334081638"
    # The scraper is a coroutine, so it has to be awaited.
    data = await scrape_xueqiu_playwright_async(target_url)

    divider = "=" * 30
    print("\n" + divider)
    print("      Scraped Data Summary (Playwright Async w/ nest_asyncio)")
    print(divider)

    # Main post: show at most 500 characters, marking truncation with "...".
    print("\n--- Main Post ---")
    post_text = data["main_post"]
    if post_text:
        tail = '...' if len(post_text) > 500 else ''
        print(post_text[:500] + tail)
    else:
        print(">>> Main post content not found or scraping failed. <<<")

    # Comments: numbered from 1, each cut to 150 characters.
    print("\n--- Comments ---")
    comments = data["comments"]
    if comments:
        print(f"Found {len(comments)} comments:")
        for number, body in enumerate(comments, start=1):
            tail = '...' if len(body) > 150 else ''
            print(f"{number}. {body[:150]}" + tail)
    else:
        print(">>> No comments found or scraping failed. <<<")

    print("\n" + divider)
    print("Check console logs and playwright_debug_*.png files for details.")

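# To run this in a Jupyter Notebook cell, you STILL execute it like this:
#     await main()
#
# Because nest_asyncio is applied, asyncio.run(main()) can also be called from
# inside the notebook's already-running loop, but `await main()` is preferred:
#     if __name__ == "__main__":
#         asyncio.run(main())
#
# NOTE(review): nest_asyncio only allows the running loop to be re-entered; it
# does not change the loop type. The NotImplementedError in the output below is
# presumably the event loop refusing to spawn Playwright's driver subprocess
# (typical of a selector event loop on Windows) - confirm before relying on
# nest_asyncio as the fix.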

In [None]:
# Top-level await: valid in a notebook/IPython cell, not in a plain script.
await main()

--- Starting Playwright Async Scraper (with nest_asyncio) ---

--- An critical error occurred during the Playwright Async process ---
Error Type: NotImplementedError
Error Details: 
--- Playwright Async Scraper Finished ---

      Scraped Data Summary (Playwright Async w/ nest_asyncio)

--- Main Post ---
>>> Main post content not found or scraping failed. <<<

--- Comments ---
>>> No comments found or scraping failed. <<<

Check console logs and playwright_debug_*.png files for details.


: 