In [1]:
#%pip install selenium
from selenium import webdriver


driver = webdriver.Chrome()
driver.get('https://selenium.dev/')
driver.quit()

In [22]:
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    ElementClickInterceptedException
)

def scroll_and_screenshot_by_distance(url, scroll_pause_time=1.5, scroll_increment_js="window.scrollBy(0, window.innerHeight);"):
    """
    Navigates to a URL, handles popups, scrolls down in increments, taking a screenshot
    at each step until the bottom of the page is reached.

    Args:
        url (str): The URL of the page to scroll.
        scroll_pause_time (float): Time in seconds to wait after each scroll
                                   for content to potentially load.
        scroll_increment_js (str): JavaScript to execute for scrolling.
                                   Default scrolls by one viewport height.
    """
    print("Setting up WebDriver...")
    options = webdriver.ChromeOptions()
    # Keep browser visible to observe
    # options.add_argument("--headless")
    options.add_argument("--disable-gpu"); options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage"); options.add_argument("--window-size=1200,800")
    options.add_argument("--disable-notifications"); options.add_argument("--lang=zh-CN")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])

    screenshot_dir = "screenshots_incremental_scroll"
    if not os.path.exists(screenshot_dir):
        try: os.makedirs(screenshot_dir); print(f"Created '{screenshot_dir}' directory.")
        except OSError as e: print(f"Error creating screenshot directory: {e}"); return

    driver = None
    try:
        driver = webdriver.Chrome(options=options)
        # driver.maximize_window() # Maximize or set specific size above

        print(f"Navigating to: {url}")
        driver.get(url)
        try:
            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            print("Body element loaded.")
        except TimeoutException:
            print("Page body did not become present within timeout.")
        time.sleep(3)

        # --- Handle Initial Pop-ups (Integrated robust logic) ---
        try:
            print("Looking for the first pop-up ('跳过')..."); skip_xpath = "//span[text()='跳过'] | //button[contains(.,'跳过')]"; skip_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, skip_xpath)))
            print("Clicking '跳过'..."); driver.execute_script("arguments[0].click();", skip_button); time.sleep(0.5)
        except TimeoutException: print("First pop-up ('跳过') not found or timed out.")
        except Exception as e: print(f"Error handling first pop-up: {e}")
        try:
            print("Looking for the second pop-up ('X')..."); close_xpaths = [ "//div[contains(@class,'modal-wrapper')]//i[contains(@class,'icon-close')]", "//div[@class='xq-dialog-wrapper']//i[contains(@class,'close')]", "//i[contains(@class, 'cube-dialog-close')]", "//div[contains(@class, 'Modal_modal')]//i[contains(@class, 'Modal_closeIcon')]", "//button[@aria-label='Close']" ]
            close_button_found = False
            for xpath in close_xpaths:
                 try: close_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))); print(f"Clicking second pop-up 'X'..."); driver.execute_script("arguments[0].click();", close_button); close_button_found = True; time.sleep(0.5); break
                 except TimeoutException: continue
                 except Exception: continue
            if not close_button_found: print("Second pop-up ('X') not found or timed out.")
        except Exception as e: print(f"Error during second pop-up handling: {e}")
        # --- End Pop-up Handling ---

        # Optional: Click 'Comments' Tab
        # try:
        #     print("Looking for and clicking '评论' tab before scrolling...")
        #     tab_xpath = "//span[text()='评论' and contains(@class,'tabs__item__title')]/ancestor::div[contains(@class,'tabs__item')] | //div[contains(@class, 'tabs__item') and .//span[text()='评论']] | //div[contains(@class, 'action-bar__item') and contains(., '评论')]"
        #     comments_tab_element = WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH, tab_xpath)))
        #     driver.execute_script("arguments[0].scrollIntoView({block: 'center', inline: 'nearest'});", comments_tab_element)
        #     time.sleep(0.5)
        #     comments_tab_clickable = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, tab_xpath)))
        #     driver.execute_script("arguments[0].click();", comments_tab_clickable)
        #     print("Clicked '评论' tab. Waiting before scroll...")
        #     time.sleep(2.5)
        # except Exception as e_tab:
        #     print(f"Could not click '评论' tab before scrolling (maybe not needed or error): {e_tab}")

        print("\n--- Starting Incremental Scroll and Screenshot Process ---")
        last_height = driver.execute_script("return document.body.scrollHeight")
        scroll_attempt = 0

        while True:
            scroll_attempt += 1
            print(f"--- Scroll Attempt #{scroll_attempt} ---")

            screenshot_path = os.path.join(screenshot_dir, f"scroll_{scroll_attempt:02d}_before.png")
            try:
                driver.save_screenshot(screenshot_path)
                print(f"Saved screenshot: {screenshot_path}")
            except Exception as e_ss:
                print(f"Failed to save screenshot {screenshot_path}: {e_ss}")

            print(f"Scrolling down using: {scroll_increment_js}")
            driver.execute_script(scroll_increment_js)

            print(f"Pausing for {scroll_pause_time} seconds...")
            time.sleep(scroll_pause_time)

            new_height = driver.execute_script("return document.body.scrollHeight")
            current_scroll_y = driver.execute_script("return window.pageYOffset || document.documentElement.scrollTop;")
            viewport_height = driver.execute_script("return window.innerHeight;")
            print(f"  Current scrollY: {round(current_scroll_y)}, Viewport height: {round(viewport_height)}, Total scrollHeight: {new_height}")

            if current_scroll_y + viewport_height >= new_height - 10:
                print("Reached the bottom of the page.")
                final_bottom_path = os.path.join(screenshot_dir, f"scroll_{scroll_attempt:02d}_at_bottom.png")
                try: driver.save_screenshot(final_bottom_path)
                except Exception as e_fin_ss: print(f"Could not save final screenshot: {e_fin_ss}")
                print(f"Saved final bottom screenshot: {final_bottom_path}")
                break
            elif scroll_attempt > 50:
                print("Reached max scroll attempts (50). Stopping.")
                break
            last_height = new_height
        print("\n--- Incremental Scroll and Screenshot Process Finished ---")

    except Exception as e:
        print(f"\n--- An critical error occurred ---")
        print(f"Error Type: {type(e).__name__}")
        print(f"Error Details: {e}")
        # --- CORRECTED INDENTATION FOR ERROR SCREENSHOT TRY-EXCEPT ---
        if driver:
            try:
                error_ss_path = os.path.join(screenshot_dir, "critical_error_scroll_script.png")
                driver.save_screenshot(error_ss_path)
                print(f"Saved error screenshot: {error_ss_path}")
            except Exception as e_ss_crit:
                 print(f"Could not save critical error screenshot: {e_ss_crit}")
        # --- END CORRECTION ---
    finally:
        if driver:
            print("Closing the browser...")
            driver.quit()
            print("Browser closed.")

# --- Main execution block ---
if __name__ == "__main__":
    target_url = "https://xueqiu.com/5669998349/334081638"
    print(f"--- Starting Incremental Scroll for URL: {target_url} ---")
    scroll_and_screenshot_by_distance(
        target_url,
        scroll_pause_time=2.0,
        scroll_increment_js="window.scrollBy(0, window.innerHeight * 0.8);"
        )
    print("--- Script Finished ---")
    print(f"Check the '{screenshot_dir}' folder for screenshots.")

--- Starting Incremental Scroll for URL: https://xueqiu.com/5669998349/334081638 ---
Setting up WebDriver...
Created 'screenshots_incremental_scroll' directory.
Navigating to: https://xueqiu.com/5669998349/334081638
Body element loaded.
Looking for the first pop-up ('跳过')...
First pop-up ('跳过') not found or timed out.
Looking for the second pop-up ('X')...
Second pop-up ('X') not found or timed out.

--- Starting Incremental Scroll and Screenshot Process ---
--- Scroll Attempt #1 ---
Saved screenshot: screenshots_incremental_scroll\scroll_01_before.png
Scrolling down using: window.scrollBy(0, window.innerHeight * 0.8);
Pausing for 2.0 seconds...
  Current scrollY: 482, Viewport height: 602, Total scrollHeight: 4924
--- Scroll Attempt #2 ---
Saved screenshot: screenshots_incremental_scroll\scroll_02_before.png
Scrolling down using: window.scrollBy(0, window.innerHeight * 0.8);
Pausing for 2.0 seconds...
  Current scrollY: 963, Viewport height: 602, Total scrollHeight: 4924
--- Scroll At

NameError: name 'screenshot_dir' is not defined

In [26]:
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    ElementClickInterceptedException,
    StaleElementReferenceException
)

def scrape_post_and_comments_on_scroll(url, max_scroll_loops=15, scroll_pause_time=2.5):
    """
    Navigates to a Xueqiu post, handles popups, scrolls to load comments,
    and attempts to scrape them. Also clicks 'expand replies' and 'load more'.
    """
    print("Setting up WebDriver...")
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")
    options.add_argument("--disable-gpu"); options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage"); options.add_argument("--window-size=1200,900")
    options.add_argument("--disable-notifications"); options.add_argument("--lang=zh-CN")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])

    screenshot_dir = "screenshots_post_comments"
    if not os.path.exists(screenshot_dir):
        try: os.makedirs(screenshot_dir); print(f"Created '{screenshot_dir}' directory.")
        except OSError as e: print(f"Error creating screenshot directory: {e}"); return {"post_content": None, "comments": []}

    driver = None
    scraped_data = {"post_content": None, "comments": []}
    unique_comment_texts_scraped = set()

    try:
        driver = webdriver.Chrome(options=options)
        driver.maximize_window()

        wait = WebDriverWait(driver, 20)
        short_wait = WebDriverWait(driver, 7)

        print(f"Navigating to: {url}")
        driver.get(url)
        try:
            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'article__bd__detail')] | //div[contains(@class, 'article__content')]")))
            print("Article body indicator loaded.")
        except TimeoutException:
            print("Article body indicator did not load. Page might be different."); driver.save_screenshot(os.path.join(screenshot_dir,"error_page_load.png")); return scraped_data
        time.sleep(2)

        # --- Handle Initial Pop-ups ---
        try:
            print("Looking for '跳过' pop-up..."); skip_xpath = "//span[text()='跳过'] | //button[contains(.,'跳过')]"; skip_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, skip_xpath)))
            driver.execute_script("arguments[0].click();", skip_button); print("Clicked '跳过'."); time.sleep(0.5)
        except TimeoutException: print("'跳过' pop-up not found/timed out.")
        except Exception as e: print(f"Error '跳过': {e}")
        try:
            print("Looking for 'X' pop-up..."); close_xpaths = [ "//div[contains(@class,'modal-wrapper')]//i[contains(@class,'icon-close')]", "//i[contains(@class, 'cube-dialog-close')]" ]
            for xpath in close_xpaths:
                 try: close_button = WebDriverWait(driver, 4).until(EC.element_to_be_clickable((By.XPATH, xpath))); driver.execute_script("arguments[0].click();", close_button); print("Clicked 'X'."); time.sleep(0.5); break
                 except: continue
            print("Finished checking for 'X' pop-ups.") # Confirmation
        except Exception as e: print(f"Error 'X' pop-up: {e}")
        # --- End Pop-up Handling ---

        # Scrape Main Post Content
        try:
            print("Scraping main post content...")
            post_content_xpath = "//div[contains(@class, 'article__bd__detail')] | //div[contains(@class, 'article__content')]"
            post_element = wait.until(EC.visibility_of_element_located((By.XPATH, post_content_xpath)))
            scraped_data["post_content"] = post_element.text.strip()
            print(f"Post content scraped (length: {len(scraped_data['post_content'])}).")
        except Exception as e_post:
            print(f"Error scraping post content: {e_post}")
            driver.save_screenshot(os.path.join(screenshot_dir,"error_post_scrape.png"))

        print("\n--- Starting scroll and comment extraction ---")
        last_height = driver.execute_script("return document.body.scrollHeight")
        no_new_content_strikes = 0

        for i in range(max_scroll_loops):
            print(f"--- Loop/Scroll attempt #{i+1} ---")
            initial_comment_count = len(unique_comment_texts_scraped)

            # 1. Click "查看N条回复"
            expand_reply_xpath = "//a[contains(text(), '查看') and contains(text(), '条回复')]"
            try:
                expand_buttons = driver.find_elements(By.XPATH, expand_reply_xpath)
                if expand_buttons:
                    print(f"Found {len(expand_buttons)} 'Expand Replies' links.")
                    for button_idx, button in enumerate(expand_buttons):
                        try:
                            if button.is_displayed():
                                print(f"  Clicking 'Expand Replies' #{button_idx+1}..."); driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", button); time.sleep(0.3)
                                driver.execute_script("arguments[0].click();", button); time.sleep(1.5)
                        except StaleElementReferenceException: print("  Stale 'Expand Replies' link, skipping.")
                        except ElementNotInteractableException: print("  'Expand Replies' link not interactable, skipping.")
                        except Exception as e_expand: print(f"  Error clicking 'Expand Replies': {e_expand}")
            except Exception as e_find_expand: print(f"Could not search for 'Expand Replies' buttons: {e_find_expand}")

            # 2. Scroll down
            print("Scrolling down...")
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight + 500);")
            time.sleep(scroll_pause_time)

            # 3. Attempt to scrape comments
            comment_text_xpath = "//div[@class='comment__item__main']/p"
            try:
                comment_p_tags = driver.find_elements(By.XPATH, comment_text_xpath)
                if comment_p_tags:
                    print(f"  Found {len(comment_p_tags)} potential comment <p> tags.")
                    new_comments_found_this_pass = 0
                    for p_tag in comment_p_tags:
                        try:
                            comment_text = p_tag.text.strip()
                            if comment_text and comment_text not in unique_comment_texts_scraped:
                                unique_comment_texts_scraped.add(comment_text)
                                new_comments_found_this_pass +=1
                        except StaleElementReferenceException: continue
                        except Exception as e_text: print(f"    Error getting text from a p_tag: {e_text}")
                    if new_comments_found_this_pass > 0: print(f"    Added {new_comments_found_this_pass} new unique comments.")
            except Exception as e_find: print(f"  Error finding comment <p> tags: {e_find}")

            # 4. Click "展开查看更多"
            load_more_comments_xpath = "//div[contains(@class,'more-comment') and (contains(., '展开查看更多') or contains(., '加载更多'))]"
            try:
                load_more_button = short_wait.until(EC.element_to_be_clickable((By.XPATH, load_more_comments_xpath)))
                print("  Found '展开查看更多/加载更多' button. Clicking..."); driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", load_more_button); time.sleep(0.3)
                driver.execute_script("arguments[0].click();", load_more_button); print("  Clicked '展开查看更多/加载更多'."); time.sleep(scroll_pause_time)
            except TimeoutException: pass
            except Exception as e_load_more: print(f"  Error clicking '展开查看更多': {e_load_more}")

            # 5. Check for loop termination conditions
            current_height = driver.execute_script("return document.body.scrollHeight")
            print(f"  Current total comments scraped: {len(unique_comment_texts_scraped)}. Scroll height: {current_height}")
            if len(unique_comment_texts_scraped) > initial_comment_count: no_new_content_strikes = 0
            else: no_new_content_strikes += 1

            if current_height == last_height and no_new_content_strikes >= 2 :
                print("Scroll height unchanged and no new comments for 2 strikes. Assuming all loaded or stuck.")
                break
            elif no_new_content_strikes >= 3:
                print("No new comments found for 3 consecutive strikes. Assuming all loaded.")
                break
            last_height = current_height
            if i == max_scroll_loops -1 : print("Reached max scroll loops.")
            driver.save_screenshot(os.path.join(screenshot_dir,f"loop_end_{i+1}.png"))

        scraped_data["comments"] = list(unique_comment_texts_scraped)
        print(f"\n--- Finished comment scraping. Total unique comments: {len(scraped_data['comments'])} ---")

    except Exception as e:
        print(f"\n--- An critical error occurred ---")
        print(f"Error Type: {type(e).__name__}")
        print(f"Error Details: {e}")
        # --- CORRECTED INDENTATION FOR ERROR SCREENSHOT TRY-EXCEPT ---
        if driver:
            try:
                error_ss_path = os.path.join(screenshot_dir, "critical_error.png")
                driver.save_screenshot(error_ss_path)
                print(f"Saved critical error screenshot.")
            except Exception as e_ss_crit:
                 print(f"Could not save critical error screenshot: {e_ss_crit}")
        # --- END CORRECTION ---
    finally:
        if driver:
            print("Closing the browser...")
            driver.quit()
            print("Browser closed.")
    return scraped_data

# --- Main execution block ---
if __name__ == "__main__":
    target_url = "https://xueqiu.com/5669998349/334081638"
    print(f"--- Starting Scraper for URL: {target_url} ---")

    data = scrape_post_and_comments_on_scroll(target_url, max_scroll_loops=10, scroll_pause_time=3.0)

    print("\n" + "="*30); print("      Scraped Data Summary"); print("="*30)
    if data["post_content"]:
        print("\n--- Main Post ---")
        print(data["post_content"][:500] + "..." if len(data["post_content"]) > 500 else data["post_content"])
    else:
        print("\n>>> Main post content not scraped. <<<")

    if data["comments"]:
        print(f"\n--- Comments ({len(data['comments'])}) ---")
        for i, comment in enumerate(data["comments"][:20]): # Print first 20
            print(f"{i+1}. {comment[:150]}" + "..." if len(comment)>150 else comment)
        if len(data["comments"]) > 20:
            print(f"... and {len(data['comments']) - 20} more comments.")
    else:
        print("\n>>> No comments were scraped. <<<")
    print("\n" + "="*30)
    print(f"Check console logs and '{screenshot_dir}' folder for details.")

--- Starting Scraper for URL: https://xueqiu.com/5669998349/334081638 ---
Setting up WebDriver...
Navigating to: https://xueqiu.com/5669998349/334081638
Article body indicator loaded.
Looking for '跳过' pop-up...
'跳过' pop-up not found/timed out.
Looking for 'X' pop-up...
Finished checking for 'X' pop-ups.
Scraping main post content...
Post content scraped (length: 627).

--- Starting scroll and comment extraction ---
--- Loop/Scroll attempt #1 ---
Scrolling down...
  Found 18 potential comment <p> tags.
    Added 2 new unique comments.
  Current total comments scraped: 2. Scroll height: 4924
--- Loop/Scroll attempt #2 ---
Scrolling down...
  Found 18 potential comment <p> tags.
  Current total comments scraped: 2. Scroll height: 4924
--- Loop/Scroll attempt #3 ---
Scrolling down...
  Found 18 potential comment <p> tags.
  Current total comments scraped: 2. Scroll height: 4924
Scroll height unchanged and no new comments for 2 strikes. Assuming all loaded or stuck.

--- Finished comment sc

NameError: name 'screenshot_dir' is not defined

In [27]:
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    ElementClickInterceptedException,
    StaleElementReferenceException,
    ElementNotInteractableException
)

def scrape_post_and_all_comments(url, max_main_loops=15, scroll_pause_time=2.5):
    print("Setting up WebDriver...")
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")
    options.add_argument("--disable-gpu"); options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage"); options.add_argument("--window-size=1200,900")
    options.add_argument("--disable-notifications"); options.add_argument("--lang=zh-CN")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])

    screenshot_dir = "screenshots_post_all_comments"
    if not os.path.exists(screenshot_dir):
        try: os.makedirs(screenshot_dir); print(f"Created '{screenshot_dir}' directory.")
        except OSError as e: print(f"Error creating screenshot directory: {e}"); return {"post_content": None, "comments": []}

    driver = None
    scraped_data = {"post_content": None, "comments": []}
    unique_comment_texts_scraped = set()

    try:
        driver = webdriver.Chrome(options=options)
        driver.maximize_window()

        wait = WebDriverWait(driver, 20)
        short_wait = WebDriverWait(driver, 5) # Shorter wait for elements within loops

        print(f"Navigating to: {url}")
        driver.get(url)
        try:
            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'article__bd__detail')] | //div[contains(@class, 'article__content')]")))
            print("Article body indicator loaded.")
        except TimeoutException:
            print("Article body indicator did not load."); driver.save_screenshot(os.path.join(screenshot_dir,"error_page_load.png")); return scraped_data
        time.sleep(2)

        # --- Handle Initial Pop-ups ---
        try:
            print("Looking for '跳过' pop-up..."); skip_xpath = "//span[text()='跳过'] | //button[contains(.,'跳过')]"; skip_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, skip_xpath)))
            driver.execute_script("arguments[0].click();", skip_button); print("Clicked '跳过'."); time.sleep(0.5)
        except TimeoutException: print("'跳过' pop-up not found/timed out.")
        except Exception as e: print(f"Error '跳过': {e}")
        try:
            print("Looking for 'X' pop-up..."); close_xpaths = [ "//div[contains(@class,'modal-wrapper')]//i[contains(@class,'icon-close')]", "//i[contains(@class, 'cube-dialog-close')]" ]
            for xpath in close_xpaths:
                 try: close_button = WebDriverWait(driver, 4).until(EC.element_to_be_clickable((By.XPATH, xpath))); driver.execute_script("arguments[0].click();", close_button); print("Clicked 'X'."); time.sleep(0.5); break
                 except: continue
            print("Finished checking for 'X' pop-ups.")
        except Exception as e: print(f"Error 'X' pop-up: {e}")

        # Scrape Main Post Content
        try:
            print("Scraping main post content...")
            post_content_xpath = "//div[contains(@class, 'article__bd__detail')] | //div[contains(@class, 'article__content')]"
            post_element = wait.until(EC.visibility_of_element_located((By.XPATH, post_content_xpath)))
            scraped_data["post_content"] = post_element.text.strip()
            print(f"Post content scraped (length: {len(scraped_data['post_content'])}).")
        except Exception as e_post:
            print(f"Error scraping post content: {e_post}")
            driver.save_screenshot(os.path.join(screenshot_dir,"error_post_scrape.png"))

        print("\n--- Starting scroll and comment extraction ---")
        last_height = driver.execute_script("return document.body.scrollHeight")
        no_new_actions_or_comments_strikes = 0

        for i in range(max_main_loops):
            print(f"--- Main Loop Iteration #{i+1} ---")
            action_taken_this_loop = False
            comments_found_before_interactions = len(unique_comment_texts_scraped)

            # 1. Click ALL visible "查看N条回复" (Expand Replies)
            expand_reply_xpath = "//a[contains(text(), '查看') and contains(text(), '条回复')]"
            # Loop to click these as new ones might appear after expanding others
            while True:
                clicked_an_expand_button_this_pass = False
                try:
                    expand_buttons = driver.find_elements(By.XPATH, expand_reply_xpath)
                    if not expand_buttons: # print("  No 'Expand Replies' links found this pass.");
                        break

                    # Filter only visible buttons before attempting to click
                    visible_expand_buttons = [b for b in expand_buttons if b.is_displayed()]
                    if not visible_expand_buttons: # print("  No *visible* 'Expand Replies' links.");
                        break

                    print(f"  Found {len(visible_expand_buttons)} visible 'Expand Replies' links to click.")
                    for button in visible_expand_buttons:
                        try:
                            # print("    Scrolling to 'Expand Replies' and clicking...")
                            driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});", button)
                            time.sleep(0.5) # Wait for scroll
                            # Use a short wait for clickability
                            button_to_click = short_wait.until(EC.element_to_be_clickable(button))
                            driver.execute_script("arguments[0].click();", button_to_click) # JS click
                            # button_to_click.click() # Regular click
                            print(f"    Clicked 'Expand Replies': {button.text[:20]}")
                            action_taken_this_loop = True
                            clicked_an_expand_button_this_pass = True
                            time.sleep(1.5) # Wait for replies to load
                            # After clicking, the DOM might change, so we might need to re-find buttons in the next 'while True' iteration
                        except StaleElementReferenceException: print("    Stale 'Expand Replies' link during click, will re-evaluate."); break # Break inner for to re-find
                        except ElementNotInteractableException: print("    'Expand Replies' link not interactable, might be covered or disabled.")
                        except TimeoutException: print("    Timeout waiting for 'Expand Replies' to be clickable.")
                        except Exception as e_expand: print(f"    Error clicking one 'Expand Replies': {e_expand}")
                    if not clicked_an_expand_button_this_pass: # No visible ones were successfully clicked
                        break
                except Exception as e_find_expand: print(f"  Error finding 'Expand Replies': {e_find_expand}"); break
                if not clicked_an_expand_button_this_pass: break # If loop completes without any clicks

            # 2. Scroll down
            print("  Scrolling down...")
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)

            # 3. Attempt to scrape comments
            comment_text_xpath = "//div[@class='comment__item__main']/p" # This XPath seems correct
            try:
                comment_p_tags = driver.find_elements(By.XPATH, comment_text_xpath)
                if comment_p_tags:
                    # print(f"  Found {len(comment_p_tags)} potential comment <p> tags on this pass.")
                    new_comments_this_pass = 0
                    for p_tag in comment_p_tags:
                        try:
                            comment_text = p_tag.text.strip()
                            if comment_text and comment_text not in unique_comment_texts_scraped:
                                unique_comment_texts_scraped.add(comment_text)
                                new_comments_this_pass += 1
                        except StaleElementReferenceException: continue
                        except Exception: continue # Ignore errors for individual comment text retrieval
                    if new_comments_this_pass > 0:
                        print(f"    Added {new_comments_this_pass} new unique comments this pass.")
                        action_taken_this_loop = True # Finding new comments is an action
            except Exception as e_find_comments: print(f"  Error finding comment <p> tags: {e_find_comments}")

            # 4. Click "展开查看更多" (Load More Main Comments)
            load_more_comments_xpath = "//div[contains(@class,'more-comment') and (contains(., '展开查看更多') or contains(., '加载更多'))]"
            try:
                # Use short_wait for this button as it might appear/disappear
                load_more_button = short_wait.until(EC.element_to_be_clickable((By.XPATH, load_more_comments_xpath)))
                print("  Found '展开查看更多/加载更多' button. Clicking...")
                driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});", load_more_button); time.sleep(0.3)
                driver.execute_script("arguments[0].click();", load_more_button)
                print("    Clicked '展开查看更多/加载更多'.")
                action_taken_this_loop = True
                time.sleep(scroll_pause_time + 1) # Wait longer after this action
            except TimeoutException: print("  '展开查看更多/加载更多' button not found or not clickable this pass.")
            except Exception as e_load_more: print(f"  Error clicking '展开查看更多': {e_load_more}")

            # 5. Check for loop termination conditions
            current_height = driver.execute_script("return document.body.scrollHeight")
            print(f"  Loop {i+1} end. Total unique comments: {len(unique_comment_texts_scraped)}. Scroll height: {current_height}. Last height: {last_height}")

            if not action_taken_this_loop and len(unique_comment_texts_scraped) == comments_found_before_interactions:
                no_new_actions_or_comments_strikes += 1
                print(f"  No new actions or comments strike: {no_new_actions_or_comments_strikes}")
            else:
                no_new_actions_or_comments_strikes = 0 # Reset if something happened

            if no_new_actions_or_comments_strikes >= 2:
                print("No new comments found and no interaction buttons successfully clicked for 2 consecutive loops. Assuming completion.")
                break

            last_height = current_height
            if i == max_main_loops - 1: print("Reached max main loops.")
            driver.save_screenshot(os.path.join(screenshot_dir,f"main_loop_end_{i+1}.png"))

        scraped_data["comments"] = list(unique_comment_texts_scraped)
        print(f"\n--- Finished comment scraping. Total unique comments: {len(scraped_data['comments'])} ---")

    except Exception as e:
        print(f"\n--- An critical error occurred ---")
        print(f"Error Type: {type(e).__name__}")
        print(f"Error Details: {e}")
        if driver:
            try:
                error_ss_path = os.path.join(screenshot_dir, "critical_error.png")
                driver.save_screenshot(error_ss_path)
                print(f"Saved critical error screenshot.")
            except Exception as e_ss_crit:
                 print(f"Could not save critical error screenshot: {e_ss_crit}")
    finally:
        if driver: print("Closing the browser..."); driver.quit(); print("Browser closed.")
    return scraped_data

# --- Main execution block ---
if __name__ == "__main__":
    target_url = "https://xueqiu.com/5669998349/334081638"
    print(f"--- Starting Scraper for URL: {target_url} ---")

    data = scrape_post_and_all_comments(target_url, max_main_loops=10, scroll_pause_time=2.5)

    print("\n" + "="*30); print("      Scraped Data Summary"); print("="*30)
    if data["post_content"]:
        print("\n--- Main Post ---")
        print(data["post_content"]) # Print full post content
    else:
        print("\n>>> Main post content not scraped. <<<")

    if data["comments"]:
        print(f"\n--- Comments ({len(data['comments'])}) ---")
        for i, comment in enumerate(data["comments"]):
            print(f"{i+1}. {comment}") # Print full comment
    else:
        print("\n>>> No comments were scraped. <<<")
    print("\n" + "="*30)
    print(f"Check console logs and folder for details.")

--- Starting Scraper for URL: https://xueqiu.com/5669998349/334081638 ---
Setting up WebDriver...
Created 'screenshots_post_all_comments' directory.
Navigating to: https://xueqiu.com/5669998349/334081638
Article body indicator loaded.
Looking for '跳过' pop-up...
'跳过' pop-up not found/timed out.
Looking for 'X' pop-up...
Finished checking for 'X' pop-ups.
Scraping main post content...
Post content scraped (length: 706).

--- Starting scroll and comment extraction ---
--- Main Loop Iteration #1 ---
  Scrolling down...
    Added 18 new unique comments this pass.
  '展开查看更多/加载更多' button not found or not clickable this pass.
  Loop 1 end. Total unique comments: 18. Scroll height: 4924. Last height: 4924
--- Main Loop Iteration #2 ---
  Scrolling down...
  '展开查看更多/加载更多' button not found or not clickable this pass.
  Loop 2 end. Total unique comments: 18. Scroll height: 4924. Last height: 4924
  No new actions or comments strike: 1
--- Main Loop Iteration #3 ---
  Scrolling down...
  '展开查看更多/加载

In [28]:
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    ElementClickInterceptedException,
    StaleElementReferenceException,
    ElementNotInteractableException
)

def scrape_post_and_all_comments(url, max_main_loops=20, scroll_pause_time=2.5): # Increased max_main_loops
    print("Setting up WebDriver...")
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")
    options.add_argument("--disable-gpu"); options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage"); options.add_argument("--window-size=1200,900")
    options.add_argument("--disable-notifications"); options.add_argument("--lang=zh-CN")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])

    screenshot_dir = "screenshots_post_all_comments"
    if not os.path.exists(screenshot_dir):
        try: os.makedirs(screenshot_dir); print(f"Created '{screenshot_dir}' directory.")
        except OSError as e: print(f"Error creating screenshot directory: {e}"); return {"post_content": None, "comments": []}

    driver = None
    scraped_data = {"post_content": None, "comments": []}
    unique_comment_texts_scraped = set()

    try:
        driver = webdriver.Chrome(options=options)
        driver.maximize_window()

        wait = WebDriverWait(driver, 20)
        # Shorter wait for elements that appear/disappear within loops
        interaction_wait = WebDriverWait(driver, 7)

        print(f"Navigating to: {url}")
        driver.get(url)
        try:
            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'article__bd__detail')] | //div[contains(@class, 'article__content')]")))
            print("Article body indicator loaded.")
        except TimeoutException:
            print("Article body indicator did not load."); driver.save_screenshot(os.path.join(screenshot_dir,"error_page_load.png")); return scraped_data
        time.sleep(2)

        # --- Handle Initial Pop-ups ---
        try:
            print("Looking for '跳过' pop-up..."); skip_xpath = "//span[text()='跳过'] | //button[contains(.,'跳过')]"; skip_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, skip_xpath)))
            driver.execute_script("arguments[0].click();", skip_button); print("Clicked '跳过'."); time.sleep(0.5)
        except TimeoutException: print("'跳过' pop-up not found/timed out.")
        except Exception as e: print(f"Error '跳过': {e}")
        try:
            print("Looking for 'X' pop-up..."); close_xpaths = [ "//div[contains(@class,'modal-wrapper')]//i[contains(@class,'icon-close')]", "//i[contains(@class, 'cube-dialog-close')]" ]
            for xpath in close_xpaths:
                 try: close_button = WebDriverWait(driver, 4).until(EC.element_to_be_clickable((By.XPATH, xpath))); driver.execute_script("arguments[0].click();", close_button); print("Clicked 'X'."); time.sleep(0.5); break
                 except: continue
            print("Finished checking for 'X' pop-ups.")
        except Exception as e: print(f"Error 'X' pop-up: {e}")

        # Scrape Main Post Content
        try:
            print("Scraping main post content...")
            post_content_xpath = "//div[contains(@class, 'article__bd__detail')] | //div[contains(@class, 'article__content')]"
            post_element = wait.until(EC.visibility_of_element_located((By.XPATH, post_content_xpath)))
            scraped_data["post_content"] = post_element.text.strip()
            print(f"Post content scraped (length: {len(scraped_data['post_content'])}).")
        except Exception as e_post:
            print(f"Error scraping post content: {e_post}")
            driver.save_screenshot(os.path.join(screenshot_dir,"error_post_scrape.png"))

        print("\n--- Starting scroll and comment extraction ---")
        last_total_comments = -1 # Initialize to a value different from initial count

        for i in range(max_main_loops):
            print(f"--- Main Loop Iteration #{i+1} ---")
            action_taken_this_loop = False
            comments_at_loop_start = len(unique_comment_texts_scraped)

            # 1. Click ALL visible "查看N条回复" (Expand Replies)
            expand_reply_xpath = "//a[contains(text(), '查看') and contains(text(), '条回复')]"
            # Inner loop to keep clicking expand replies as long as new ones appear or are clickable
            expand_attempts = 0
            while expand_attempts < 5: # Limit attempts to avoid infinite loop if something goes wrong
                expand_attempts += 1
                clicked_an_expand_button_this_pass = False
                try:
                    # Re-find elements each time as DOM changes
                    visible_expand_buttons = [b for b in driver.find_elements(By.XPATH, expand_reply_xpath) if b.is_displayed()]
                    if not visible_expand_buttons: break # No more visible expand buttons

                    print(f"  Found {len(visible_expand_buttons)} visible 'Expand Replies' links.")
                    for button in visible_expand_buttons:
                        try:
                            driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});", button); time.sleep(0.4)
                            # Using a short wait for the specific button to ensure it's ready
                            button_to_click = WebDriverWait(driver, 3).until(EC.element_to_be_clickable(button))
                            driver.execute_script("arguments[0].click();", button_to_click)
                            print(f"    Clicked 'Expand Replies': {button.text[:20]}")
                            action_taken_this_loop = True
                            clicked_an_expand_button_this_pass = True
                            time.sleep(1.5) # Wait for replies
                        except (StaleElementReferenceException, TimeoutException, ElementNotInteractableException): continue # Try next button or re-evaluate
                        except Exception as e_expand: print(f"    Error clicking one 'Expand Replies': {e_expand}")
                    if not clicked_an_expand_button_this_pass: break # No more were clicked this pass
                except Exception as e_find_expand: print(f"  Error finding 'Expand Replies': {e_find_expand}"); break


            # 2. Scroll down (helps reveal "Load More" and more comments)
            print("  Scrolling down...")
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time) # Allow content to load after scroll

            # 3. Attempt to scrape comments (after potential expansions and scroll)
            comment_text_xpath = "//div[@class='comment__item__main']/p"
            try:
                comment_p_tags = driver.find_elements(By.XPATH, comment_text_xpath)
                new_comments_this_pass = 0
                for p_tag in comment_p_tags:
                    try:
                        comment_text = p_tag.text.strip()
                        if comment_text and comment_text not in unique_comment_texts_scraped:
                            unique_comment_texts_scraped.add(comment_text)
                            new_comments_this_pass += 1
                    except StaleElementReferenceException: continue
                if new_comments_this_pass > 0:
                    print(f"    Added {new_comments_this_pass} new unique comments from page.")
                    action_taken_this_loop = True
            except Exception as e_find_comments: print(f"  Error finding comment <p> tags: {e_find_comments}")


            # 4. Click "展开查看更多" (Load More Main Comments)
            # UPDATED XPATH based on your screenshot:
            load_more_comments_xpath = "//a[@class='show_more' and .//span[text()='展开查看更多']]"
            try:
                # Use interaction_wait for this button
                load_more_button = interaction_wait.until(EC.element_to_be_clickable((By.XPATH, load_more_comments_xpath)))
                print("  Found '展开查看更多' button. Clicking...")
                driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});", load_more_button); time.sleep(0.4)
                driver.execute_script("arguments[0].click();", load_more_button)
                print("    Clicked '展开查看更多'.")
                action_taken_this_loop = True
                time.sleep(scroll_pause_time + 0.5) # Wait longer after this significant action
            except TimeoutException:
                print("  '展开查看更多' button not found or not clickable this pass (might be all loaded).")
            except Exception as e_load_more:
                print(f"  Error clicking '展开查看更多': {e_load_more}")


            # 5. Check for loop termination conditions
            current_total_comments = len(unique_comment_texts_scraped)
            print(f"  Loop {i+1} end. Total unique comments: {current_total_comments}. Previously: {last_total_comments}")

            if not action_taken_this_loop and current_total_comments == last_total_comments:
                print("No actions taken (no buttons clicked, no new comments found) and comment count unchanged. Assuming completion.")
                break
            
            last_total_comments = current_total_comments
            if i == max_main_loops - 1: print("Reached max main loops.")
            driver.save_screenshot(os.path.join(screenshot_dir,f"main_loop_end_{i+1}.png"))

        scraped_data["comments"] = list(unique_comment_texts_scraped)
        print(f"\n--- Finished comment scraping. Total unique comments: {len(scraped_data['comments'])} ---")

    except Exception as e:
        print(f"\n--- An critical error occurred ---")
        print(f"Error Type: {type(e).__name__}")
        print(f"Error Details: {e}")
        if driver:
            try:
                error_ss_path = os.path.join(screenshot_dir, "critical_error.png")
                driver.save_screenshot(error_ss_path)
                print(f"Saved critical error screenshot.")
            except Exception as e_ss_crit:
                 print(f"Could not save critical error screenshot: {e_ss_crit}")
    finally:
        if driver: print("Closing the browser..."); driver.quit(); print("Browser closed.")
    return scraped_data

# --- Main execution block ---
if __name__ == "__main__":
    target_url = "https://xueqiu.com/5669998349/334081638"
    print(f"--- Starting Scraper for URL: {target_url} ---")

    data = scrape_post_and_all_comments(target_url, max_main_loops=20, scroll_pause_time=2.5) # Increased max_main_loops

    print("\n" + "="*30); print("      Scraped Data Summary"); print("="*30)
    if data["post_content"]:
        print("\n--- Main Post ---"); print(data["post_content"])
    else: print("\n>>> Main post content not scraped. <<<")

    if data["comments"]:
        print(f"\n--- Comments ({len(data['comments'])}) ---")
        for i, comment in enumerate(data["comments"]): print(f"{i+1}. {comment}")
    else: print("\n>>> No comments were scraped. <<<")
    print("\n" + "="*30); print(f"Check console logs and folder for details.")

--- Starting Scraper for URL: https://xueqiu.com/5669998349/334081638 ---
Setting up WebDriver...
Navigating to: https://xueqiu.com/5669998349/334081638
Article body indicator loaded.
Looking for '跳过' pop-up...
'跳过' pop-up not found/timed out.
Looking for 'X' pop-up...
Finished checking for 'X' pop-ups.
Scraping main post content...
Post content scraped (length: 706).

--- Starting scroll and comment extraction ---
--- Main Loop Iteration #1 ---
  Scrolling down...
    Added 18 new unique comments from page.
  Found '展开查看更多' button. Clicking...
    Clicked '展开查看更多'.
  Loop 1 end. Total unique comments: 18. Previously: -1
--- Main Loop Iteration #2 ---
  Scrolling down...
    Added 15 new unique comments from page.
  '展开查看更多' button not found or not clickable this pass (might be all loaded).
  Loop 2 end. Total unique comments: 33. Previously: 18
--- Main Loop Iteration #3 ---
  Scrolling down...
  '展开查看更多' button not found or not clickable this pass (might be all loaded).
  Loop 3 end.

In [5]:
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    ElementClickInterceptedException,
    StaleElementReferenceException,
    ElementNotInteractableException
)

def scrape_post_and_all_comments(url, max_main_loops=20, scroll_pause_time=2.5):
    print("Setting up WebDriver...")
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")
    options.add_argument("--disable-gpu"); options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage"); options.add_argument("--window-size=1200,900")
    options.add_argument("--disable-notifications"); options.add_argument("--lang=zh-CN")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])

    screenshot_dir = "screenshots_post_all_comments"
    if not os.path.exists(screenshot_dir):
        try: os.makedirs(screenshot_dir); print(f"Created '{screenshot_dir}' directory.")
        except OSError as e: print(f"Error creating screenshot directory: {e}"); return {"post_content": None, "post_author_id": None, "comments": []}

    driver = None
    # Updated scraped_data structure
    scraped_data = {"post_content": None, "post_author_id": None, "comments": []}
    unique_comment_texts_scraped = set() # Still used to track uniqueness of comment TEXTS

    try:
        driver = webdriver.Chrome(options=options)
        driver.maximize_window()

        wait = WebDriverWait(driver, 20)
        interaction_wait = WebDriverWait(driver, 7)

        print(f"Navigating to: {url}")
        driver.get(url)
        try:
            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'article__bd__detail')] | //div[contains(@class, 'article__content')]")))
            print("Article body indicator loaded.")
        except TimeoutException:
            print("Article body indicator did not load."); driver.save_screenshot(os.path.join(screenshot_dir,"error_page_load.png")); return scraped_data
        time.sleep(2)

        # --- Handle Initial Pop-ups ---
        try:
            print("Looking for '跳过' pop-up..."); skip_xpath = "//span[text()='跳过'] | //button[contains(.,'跳过')]"; skip_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, skip_xpath)))
            driver.execute_script("arguments[0].click();", skip_button); print("Clicked '跳过'."); time.sleep(0.5)
        except TimeoutException: print("'跳过' pop-up not found/timed out.")
        except Exception as e: print(f"Error '跳过': {e}")
        try:
            print("Looking for 'X' pop-up..."); close_xpaths = [ "//div[contains(@class,'modal-wrapper')]//i[contains(@class,'icon-close')]", "//i[contains(@class, 'cube-dialog-close')]" ]
            for xpath in close_xpaths:
                 try: close_button = WebDriverWait(driver, 4).until(EC.element_to_be_clickable((By.XPATH, xpath))); driver.execute_script("arguments[0].click();", close_button); print("Clicked 'X'."); time.sleep(0.5); break
                 except: continue
            print("Finished checking for 'X' pop-ups.")
        except Exception as e: print(f"Error 'X' pop-up: {e}")

        # Scrape Main Post Content and Author ID
        try:
            print("Scraping main post content and author ID...")
            post_content_xpath = "//div[contains(@class, 'article__bd__detail')] | //div[contains(@class, 'article__content')]"
            post_element = wait.until(EC.visibility_of_element_located((By.XPATH, post_content_xpath)))
            scraped_data["post_content"] = post_element.text.strip()
            print(f"Post content scraped (length: {len(scraped_data['post_content'])}).")
            try:
                post_author_link_xpath = "//div[contains(@class, 'article__author')]//a[@data-tooltip and starts-with(@href, '/')]"
                author_link_element = wait.until(EC.presence_of_element_located((By.XPATH, post_author_link_xpath)))
                author_id_val = author_link_element.get_attribute('data-tooltip')
                if author_id_val and author_id_val.isdigit():
                    scraped_data["post_author_id"] = author_id_val
                    print(f"Post author ID scraped: {author_id_val}")
                else:
                    href = author_link_element.get_attribute('href')
                    if href: potential_id = href.split('/')[-1]
                    if potential_id.isdigit(): scraped_data["post_author_id"] = potential_id; print(f"Post author ID scraped from href: {potential_id}")
            except TimeoutException: print("Post author link (for ID) not found.")
            except Exception as e_author: print(f"Error scraping post author ID: {e_author}")
        except Exception as e_post:
            print(f"Error scraping post content/author: {e_post}")
            driver.save_screenshot(os.path.join(screenshot_dir,"error_post_scrape.png"))

        print("\n--- Starting scroll and comment extraction ---")
        last_total_unique_texts_count = -1 # Based on unique_comment_texts_scraped

        for i in range(max_main_loops):
            print(f"--- Main Loop Iteration #{i+1} ---")
            action_taken_this_loop = False
            # comments_at_loop_start = len(unique_comment_texts_scraped) # original logic used len of this set

            # 1. Click ALL visible "查看N条回复" (Expand Replies)
            expand_reply_xpath = "//a[contains(text(), '查看') and contains(text(), '条回复') and not(ancestor::div[contains(@style,'display: none')]) and not(ancestor::div[contains(@class,'hide')])]"
            expand_attempts = 0
            while expand_attempts < 5:
                expand_attempts += 1
                clicked_an_expand_button_this_pass = False
                try:
                    visible_expand_buttons = [b for b in driver.find_elements(By.XPATH, expand_reply_xpath) if b.is_displayed() and b.is_enabled()]
                    if not visible_expand_buttons:
                        if expand_attempts > 1: print(f"  No more 'Expand Replies' visible in sub-attempt {expand_attempts}.")
                        break
                    print(f"  Found {len(visible_expand_buttons)} visible 'Expand Replies' links (Attempt {expand_attempts}).")
                    for button in visible_expand_buttons:
                        try:
                            if not button.is_displayed() or not button.is_enabled(): continue # Re-check
                            driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});", button); time.sleep(0.4)
                            button_to_click = WebDriverWait(driver, 3).until(EC.element_to_be_clickable(button))
                            driver.execute_script("arguments[0].click();", button_to_click)
                            print(f"    Clicked 'Expand Replies': {button.text[:30]}...")
                            action_taken_this_loop = True
                            clicked_an_expand_button_this_pass = True
                            time.sleep(1.5) # Wait for replies
                        except (StaleElementReferenceException, TimeoutException, ElementNotInteractableException): continue
                        except Exception as e_expand: print(f"    Error clicking one 'Expand Replies': {e_expand}")
                    if not clicked_an_expand_button_this_pass and expand_attempts > 1 : break # No more were clicked this pass
                    if clicked_an_expand_button_this_pass : time.sleep(0.5) # Short pause if something was clicked
                except Exception as e_find_expand: print(f"  Error finding 'Expand Replies': {e_find_expand}"); break

            # 2. Scroll down (helps reveal "Load More" and more comments)
            print("  Scrolling down...")
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)

            # 3. Attempt to scrape comments (text and author ID)
            # XPath for comment text p tags, combining original and a variant for replies
            comment_p_tag_xpath = "//div[@class='comment__item__main']/p | //div[contains(@class, 'comment__content')]/p"
            try:
                comment_p_tags = driver.find_elements(By.XPATH, comment_p_tag_xpath)
                new_comments_added_to_list_this_pass = 0
                
                if comment_p_tags:
                    print(f"    Found {len(comment_p_tags)} potential comment <p> tags.")

                for p_tag in comment_p_tags:
                    comment_text = ""
                    author_id = None
                    try:
                        comment_text = p_tag.text.strip()
                        if not comment_text: # Skip if p_tag is empty after strip
                            continue

                        if comment_text not in unique_comment_texts_scraped:
                            unique_comment_texts_scraped.add(comment_text) # Add to set for uniqueness check
                            
                            # Find author ID related to this p_tag
                            try:
                                # Navigate up to the common ancestor 'comment_item' or 'reply_item'
                                # This XPath tries to find the closest div ancestor that contains 'comment_item' in its class
                                comment_block_ancestor = p_tag.find_element(By.XPATH, "./ancestor::div[contains(@class, 'comment_item')][1]")
                                
                                try:
                                    # From this ancestor, find the author link (e.g., <a data-tooltip="ID" class="avatar">)
                                    author_link_el = comment_block_ancestor.find_element(By.XPATH, ".//a[@data-tooltip and starts-with(@href, '/')]")
                                    temp_author_id = author_link_el.get_attribute('data-tooltip')
                                    if temp_author_id and temp_author_id.isdigit():
                                        author_id = temp_author_id
                                    else: 
                                        href = author_link_el.get_attribute('href')
                                        if href:
                                            potential_id_from_href = href.split('/')[-1]
                                            if potential_id_from_href.isdigit():
                                                author_id = potential_id_from_href
                                except NoSuchElementException:
                                    # print(f"      Author ID link not found for comment text: '{comment_text[:30]}...'")
                                    pass # Author ID will remain None
                            except NoSuchElementException:
                                # print(f"      Could not find 'comment_item' ancestor for text: '{comment_text[:30]}...'")
                                pass # Author ID will remain None

                            scraped_data["comments"].append({"text": comment_text, "author_id": author_id})
                            new_comments_added_to_list_this_pass += 1
                            
                    except StaleElementReferenceException:
                        # print("      Stale p_tag encountered.")
                        continue 
                    except Exception as e_proc_p:
                        print(f"      Error processing one p_tag for comment: {e_proc_p}")
                
                if new_comments_added_to_list_this_pass > 0:
                    print(f"    Added {new_comments_added_to_list_this_pass} new unique comments to data list.")
                    action_taken_this_loop = True # Mark action if new comments were actually added to the list
            except Exception as e_find_comments:
                print(f"  Error finding comment <p> tags: {e_find_comments}")

            # 4. Click "展开查看更多" (Load More Main Comments)
            load_more_comments_xpath = "//a[@class='show_more' and .//span[text()='展开查看更多']]"
            try:
                load_more_button = interaction_wait.until(EC.element_to_be_clickable((By.XPATH, load_more_comments_xpath)))
                print("  Found '展开查看更多' button. Clicking...")
                driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});", load_more_button); time.sleep(0.4)
                driver.execute_script("arguments[0].click();", load_more_button)
                print("    Clicked '展开查看更多'.")
                action_taken_this_loop = True
                time.sleep(scroll_pause_time + 0.5) # Wait longer
            except TimeoutException:
                print("  '展开查看更多' button not found or not clickable this pass (might be all loaded).")
            except Exception as e_load_more:
                print(f"  Error clicking '展开查看更多': {e_load_more}")

            # 5. Check for loop termination conditions
            current_unique_texts_count = len(unique_comment_texts_scraped)
            print(f"  Loop {i+1} end. Total unique comment texts scraped: {current_unique_texts_count}. Previously: {last_total_unique_texts_count}. Total comments in list: {len(scraped_data['comments'])}")

            if not action_taken_this_loop and current_unique_texts_count == last_total_unique_texts_count:
                # Ensure we don't break on the very first loop if last_total_unique_texts_count is still -1 and no actions taken
                if last_total_unique_texts_count != -1 or i > 0 : 
                    print("No actions taken and unique comment text count unchanged. Assuming completion.")
                    break
            
            last_total_unique_texts_count = current_unique_texts_count
            if i == max_main_loops - 1: print("Reached max main loops.")
            driver.save_screenshot(os.path.join(screenshot_dir,f"main_loop_end_{i+1}.png"))

        # No need to convert unique_comment_texts_scraped to list, as scraped_data["comments"] is already the list of dicts
        print(f"\n--- Finished comment scraping. Total comments in list: {len(scraped_data['comments'])} ---")

    except Exception as e:
        print(f"\n--- An critical error occurred ---")
        print(f"Error Type: {type(e).__name__}")
        print(f"Error Details: {e}")
        if driver:
            try:
                error_ss_path = os.path.join(screenshot_dir, "critical_error.png")
                driver.save_screenshot(error_ss_path)
                print(f"Saved critical error screenshot.")
            except Exception as e_ss_crit:
                 print(f"Could not save critical error screenshot: {e_ss_crit}")
    finally:
        if driver: print("Closing the browser..."); driver.quit(); print("Browser closed.")
    return scraped_data

# --- Main execution block ---
if __name__ == "__main__":
    target_url = "https://xueqiu.com/5669998349/334081638"
    # For testing with a page known to have many comments and replies:
    # target_url = "https://xueqiu.com/1929796349/272374603" # Example with many comments
    print(f"--- Starting Scraper for URL: {target_url} ---")
    
    # Accessing the default value of screenshot_dir for the final print message.
    # Better would be to return it or make it a global constant if needed outside.
    global_screenshot_dir_name = scrape_post_and_all_comments.__defaults__[0]


    data = scrape_post_and_all_comments(target_url, max_main_loops=20, scroll_pause_time=2.5)

    print("\n" + "="*30); print("      Scraped Data Summary"); print("="*30)
    if data["post_content"]:
        print("\n--- Main Post ---")
        print(f"Author ID: {data.get('post_author_id', 'N/A')}")
        print(data["post_content"])
    else: print("\n>>> Main post content not scraped. <<<")

    if data["comments"]:
        print(f"\n--- Comments ({len(data['comments'])}) ---")
        for i, comment_data in enumerate(data["comments"]):
            print(f"{i+1}. Author ID: {comment_data.get('author_id', 'N/A')}, Comment: {comment_data['text']}")
    else: print("\n>>> No comments were scraped. <<<")
    print("\n" + "="*30); print(f"Check console logs and '{global_screenshot_dir_name}' folder for details.")

--- Starting Scraper for URL: https://xueqiu.com/5669998349/334081638 ---
Setting up WebDriver...
Navigating to: https://xueqiu.com/5669998349/334081638
Article body indicator loaded.
Looking for '跳过' pop-up...
'跳过' pop-up not found/timed out.
Looking for 'X' pop-up...
Finished checking for 'X' pop-ups.
Scraping main post content and author ID...
Post content scraped (length: 706).
Post author ID scraped: 5669998349

--- Starting scroll and comment extraction ---
--- Main Loop Iteration #1 ---
  Scrolling down...
    Found 19 potential comment <p> tags.
    Added 19 new unique comments to data list.
  Found '展开查看更多' button. Clicking...
    Clicked '展开查看更多'.
  Loop 1 end. Total unique comment texts scraped: 19. Previously: -1. Total comments in list: 19
--- Main Loop Iteration #2 ---
  Scrolling down...
    Found 33 potential comment <p> tags.
    Added 14 new unique comments to data list.
  '展开查看更多' button not found or not clickable this pass (might be all loaded).
  Loop 2 end. Total 

In [21]:
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    ElementClickInterceptedException,
    StaleElementReferenceException,
    ElementNotInteractableException
)

def scrape_main_feed_post_ids(url="https://xueqiu.com/", num_ids_to_get=3):
    print(f"Setting up WebDriver for {url}...")
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")
    options.add_argument("--disable-gpu"); options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage"); options.add_argument("--window-size=1200,900")
    options.add_argument("--disable-notifications"); options.add_argument("--lang=zh-CN")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])

    screenshot_dir = "screenshots_main_feed"
    if not os.path.exists(screenshot_dir):
        try: os.makedirs(screenshot_dir); print(f"Created '{screenshot_dir}' directory.")
        except OSError as e: print(f"Error creating screenshot directory: {e}"); return []

    driver = None
    post_ids = []
    # Using a set for unique_post_ids to ensure we don't add duplicates if multiple links point to the same post
    unique_post_ids = set()


    try:
        driver = webdriver.Chrome(options=options)
        driver.maximize_window()
        wait = WebDriverWait(driver, 20) 
        interaction_wait = WebDriverWait(driver, 10)

        print(f"Navigating to: {url}")
        driver.get(url)
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, "//article[contains(@class, 'timeline__item')] | //div[contains(@class, 'home_timeline')] | //div[contains(@class, 'list__container')]")) # Added another possible container
            )
            print("Main feed container indicator loaded.")
        except TimeoutException:
            print("Main feed container indicator did not load."); driver.save_screenshot(os.path.join(screenshot_dir,"error_page_load.png")); return []
        time.sleep(3) 

        # --- Handle Initial Pop-ups ---
        try:
            print("Looking for '跳过' (Skip) pop-up..."); skip_xpath = "//span[text()='跳过'] | //button[contains(.,'跳过')]"; skip_button = WebDriverWait(driver, 7).until(EC.element_to_be_clickable((By.XPATH, skip_xpath))) # Shorter wait for popups
            driver.execute_script("arguments[0].click();", skip_button); print("Clicked '跳过'."); time.sleep(0.5)
        except TimeoutException: print("'跳过' pop-up not found/timed out.")
        except Exception as e: print(f"Error clicking '跳过': {e}")
        try:
            print("Looking for 'X' pop-up (modal close)..."); close_xpaths = [ "//div[contains(@class,'modal-wrapper')]//i[contains(@class,'icon-close')]", "//i[contains(@class, 'cube-dialog-close')]", "//div[@class='optional_tip']/i[@class='icon-close']" ]
            for xpath_item in close_xpaths:
                 try: close_button = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, xpath_item))); driver.execute_script("arguments[0].click();", close_button); print(f"Clicked 'X' for pop-up using: {xpath_item}"); time.sleep(0.5); break
                 except: continue
            print("Finished checking for 'X' pop-ups.")
        except Exception as e: print(f"Error 'X' pop-up: {e}")
        
        time.sleep(1)

        print(f"\n--- Attempting to scrape first {num_ids_to_get} post IDs ---")
        article_xpath = "//article[contains(@class, 'timeline__item') or contains(@class, 'home__timeline__item')]"
        
        attempts = 0
        max_scroll_attempts = 7 # Increased scroll attempts

        while len(unique_post_ids) < num_ids_to_get and attempts < max_scroll_attempts:
            attempts += 1
            print(f"Attempt #{attempts} to find articles...")
            
            article_elements = driver.find_elements(By.XPATH, article_xpath)
            print(f"  Found {len(article_elements)} article elements on page in attempt {attempts}.")

            if not article_elements and attempts == 1:
                print("  No articles found on first pass, waiting a bit more...")
                time.sleep(5)
                article_elements = driver.find_elements(By.XPATH, article_xpath)
                print(f"  Found {len(article_elements)} article elements after extra wait.")

            for article_idx, article in enumerate(article_elements):
                if len(unique_post_ids) >= num_ids_to_get:
                    break
                try:
                    link_element = None
                    href_value = None
                    
                    # Define XPaths to try for the link
                    # Path from your screenshot: <a class="style_fake-anchor_2cg fake-anchor" href="/USERID/POSTID">
                    # It is inside <div class="style_timeline_item_content_38K">
                    # which is inside <div class="style_timeline_item_main_1HD">
                    # which is inside <article class="style_timeline__item_3WW">
                    
                    # Most specific based on your screenshot's structure
                    xpath_for_link_v1 = ".//div[contains(@class, 'timeline_item_content')]//a[contains(@class, 'fake-anchor_') and starts-with(@href, '/')]"
                    # General fake-anchor
                    xpath_for_link_v2 = ".//a[contains(@class, 'fake-anchor_') and starts-with(@href, '/')]"
                    # More general link that has an h3 title, usually a main post link
                    xpath_for_link_v3 = ".//a[starts-with(@href, '/') and not(contains(@class, 'avatar')) and .//h3]"


                    link_xpaths_to_try = [xpath_for_link_v1, xpath_for_link_v2, xpath_for_link_v3]
                    
                    for i_xpath, link_xpath in enumerate(link_xpaths_to_try):
                        try:
                            link_element = article.find_element(By.XPATH, link_xpath)
                            if link_element:
                                print(f"    Article {article_idx}: Found link using XPath variant {i_xpath+1}")
                                break 
                        except NoSuchElementException:
                            if i_xpath == len(link_xpaths_to_try) -1: # If last attempt failed
                                print(f"    Article {article_idx}: All XPath variants for link failed.")
                            continue # Try next XPath

                    if link_element:
                        href_value = link_element.get_attribute('href')
                        if href_value:
                            print(f"    Article {article_idx}: Link href: {href_value}")
                            parts = href_value.strip('/').split('/')
                            if len(parts) == 2: 
                                user_id, post_id_str = parts[0], parts[1]
                                if post_id_str.isdigit():
                                    if post_id_str not in unique_post_ids:
                                        print(f"      Extracted NEW Post ID: {post_id_str} (User ID: {user_id})")
                                        unique_post_ids.add(post_id_str)
                                    # else:
                                        # print(f"      Post ID {post_id_str} already found.")
                                    if len(unique_post_ids) >= num_ids_to_get:
                                        break
                                else:
                                    print(f"      Post ID part '{post_id_str}' is not numeric from href '{href_value}'.")
                            else:
                                print(f"      Href '{href_value}' does not have '/user_id/post_id' structure after splitting.")
                        else:
                            print(f"    Article {article_idx}: Link found but href is empty.")
                    # If link_element is still None after trying all XPaths, it will be handled by the outer loop condition or next scroll.

                except Exception as e_article:
                    print(f"    Error processing article {article_idx}: {e_article}")

            if len(unique_post_ids) < num_ids_to_get:
                if len(article_elements) > 0 :
                    print(f"  Still need more IDs ({len(unique_post_ids)}/{num_ids_to_get}). Scrolling down...")
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 
                    time.sleep(3.5) 
                else:
                    print("  No articles found on this scroll pass, or all processed.")
                    # if attempts > 2 and not unique_post_ids : # If few attempts and still no IDs, might be an issue
                    #    print("  Breaking scroll attempts as no articles seem to be loading or matching.")
                    #    break
            
        if not unique_post_ids:
            print("Could not find any post IDs after all attempts. Saving a screenshot.")
            driver.save_screenshot(os.path.join(screenshot_dir, "no_post_ids_found.png"))
        else:
            post_ids = list(unique_post_ids)


    except Exception as e:
        print(f"\n--- An critical error occurred ---"); print(f"Error Type: {type(e).__name__}"); print(f"Error Details: {e}")
        if driver:
            try: error_ss_path = os.path.join(screenshot_dir, "critical_error_main_feed.png"); driver.save_screenshot(error_ss_path); print(f"Saved critical error screenshot to {error_ss_path}")
            except Exception as e_ss_crit: print(f"Could not save critical error screenshot: {e_ss_crit}")
    finally:
        if driver: print("Closing the browser..."); driver.quit(); print("Browser closed.")
    
    return post_ids[:num_ids_to_get]

if __name__ == "__main__":
    print(f"--- Starting Scraper for Main Xueqiu Feed Post IDs ---")
    extracted_ids = scrape_main_feed_post_ids(num_ids_to_get=3)
    print("\n" + "="*30); print("      Scraped Post IDs"); print("="*30)
    if extracted_ids:
        print(f"Successfully scraped {len(extracted_ids)} post IDs:")
        for i, pid in enumerate(extracted_ids): print(f"{i+1}. {pid}")
    else: print("\n>>> No post IDs were scraped. <<<")
    print("\n" + "="*30); print(f"Check console logs and the 'screenshots_main_feed' folder for details.")

--- Starting Scraper for Main Xueqiu Feed Post IDs ---
Setting up WebDriver for https://xueqiu.com/...
Navigating to: https://xueqiu.com/
Main feed container indicator loaded.
Looking for '跳过' (Skip) pop-up...
'跳过' pop-up not found/timed out.
Looking for 'X' pop-up (modal close)...
Finished checking for 'X' pop-ups.

--- Attempting to scrape first 3 post IDs ---
Attempt #1 to find articles...
  Found 7 article elements on page in attempt 1.
    Article 0: Found link using XPath variant 2
    Article 0: Link href: https://xueqiu.com/5367879511/334490187
      Href 'https://xueqiu.com/5367879511/334490187' does not have '/user_id/post_id' structure after splitting.
    Article 1: Found link using XPath variant 2
    Article 1: Link href: https://xueqiu.com/1760673340/334516549
      Href 'https://xueqiu.com/1760673340/334516549' does not have '/user_id/post_id' structure after splitting.
    Article 2: Found link using XPath variant 2
    Article 2: Link href: https://xueqiu.com/77892069

In [23]:
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    ElementClickInterceptedException,
    StaleElementReferenceException,
    ElementNotInteractableException
)
from urllib.parse import urlparse # Import for URL parsing

def scrape_main_feed_post_links(url="https://xueqiu.com/", max_scroll_attempts=5):
    print(f"Setting up WebDriver for {url}...")
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")
    options.add_argument("--disable-gpu"); options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage"); options.add_argument("--window-size=1200,900")
    options.add_argument("--disable-notifications"); options.add_argument("--lang=zh-CN")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])

    screenshot_dir = "screenshots_main_feed_links"
    if not os.path.exists(screenshot_dir):
        try: os.makedirs(screenshot_dir); print(f"Created '{screenshot_dir}' directory.")
        except OSError as e: print(f"Error creating screenshot directory: {e}"); return []

    driver = None
    # Using a set for unique_links to ensure we don't add duplicates
    unique_links_found = set()

    try:
        driver = webdriver.Chrome(options=options)
        driver.maximize_window()
        # General wait not used as much here, waits are more specific
        # wait = WebDriverWait(driver, 20)
        # interaction_wait = WebDriverWait(driver, 10)

        print(f"Navigating to: {url}")
        driver.get(url)
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, "//article[contains(@class, 'timeline__item')] | //div[contains(@class, 'home_timeline')] | //div[contains(@class, 'list__container')]"))
            )
            print("Main feed container indicator loaded.")
        except TimeoutException:
            print("Main feed container indicator did not load."); driver.save_screenshot(os.path.join(screenshot_dir,"error_page_load.png")); return []
        time.sleep(3) 

        # --- Handle Initial Pop-ups ---
        try:
            print("Looking for '跳过' (Skip) pop-up..."); skip_xpath = "//span[text()='跳过'] | //button[contains(.,'跳过')]"; skip_button = WebDriverWait(driver, 7).until(EC.element_to_be_clickable((By.XPATH, skip_xpath)))
            driver.execute_script("arguments[0].click();", skip_button); print("Clicked '跳过'."); time.sleep(0.5)
        except TimeoutException: print("'跳过' pop-up not found/timed out.")
        except Exception as e: print(f"Error clicking '跳过': {e}")
        try:
            print("Looking for 'X' pop-up (modal close)..."); close_xpaths = [ "//div[contains(@class,'modal-wrapper')]//i[contains(@class,'icon-close')]", "//i[contains(@class, 'cube-dialog-close')]", "//div[@class='optional_tip']/i[@class='icon-close']" ]
            for xpath_item in close_xpaths:
                 try: close_button = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, xpath_item))); driver.execute_script("arguments[0].click();", close_button); print(f"Clicked 'X' for pop-up using: {xpath_item}"); time.sleep(0.5); break
                 except: continue
            print("Finished checking for 'X' pop-ups.")
        except Exception as e: print(f"Error 'X' pop-up: {e}")
        
        time.sleep(1)

        print(f"\n--- Attempting to scrape all unique post links (max scrolls: {max_scroll_attempts}) ---")
        article_xpath = "//article[contains(@class, 'timeline__item') or contains(@class, 'home__timeline__item')]"
        
        last_num_links_found = -1

        for attempt in range(max_scroll_attempts):
            current_links_before_scroll = len(unique_links_found)
            print(f"Scroll Attempt #{attempt+1}...")
            
            article_elements = driver.find_elements(By.XPATH, article_xpath)
            print(f"  Found {len(article_elements)} article elements on page.")

            if not article_elements and attempt == 0: # Special wait if nothing on first try
                print("  No articles found on first pass, waiting a bit more...")
                time.sleep(5)
                article_elements = driver.find_elements(By.XPATH, article_xpath)
                print(f"  Found {len(article_elements)} article elements after extra wait.")

            for article_idx, article in enumerate(article_elements):
                try:
                    link_element = None
                    href_value = None
                    
                    # Define XPaths to try for the link
                    xpath_for_link_v1 = ".//div[contains(@class, 'timeline_item_content')]//a[contains(@class, 'fake-anchor_') and (starts-with(@href, '/') or starts-with(@href, 'http'))]"
                    xpath_for_link_v2 = ".//a[contains(@class, 'fake-anchor_') and (starts-with(@href, '/') or starts-with(@href, 'http'))]"
                    xpath_for_link_v3 = ".//a[(starts-with(@href, '/') or starts-with(@href, 'http')) and not(contains(@class, 'avatar')) and .//h3]"
                    
                    link_xpaths_to_try = [xpath_for_link_v1, xpath_for_link_v2, xpath_for_link_v3]
                    
                    for i_xpath, link_xpath in enumerate(link_xpaths_to_try):
                        try:
                            # Use find_elements to catch all matching links, then process the first valid one
                            # Some articles might have multiple such links, we want the primary post link.
                            # Heuristic: the first one is usually the main link for the article title/content.
                            potential_links = article.find_elements(By.XPATH, link_xpath)
                            if potential_links:
                                link_element = potential_links[0] # Take the first one
                                print(f"    Article {article_idx}: Found link using XPath variant {i_xpath+1}")
                                break 
                        except NoSuchElementException: # Should not happen with find_elements, but as a safeguard
                            pass # Handled by if potential_links
                        except StaleElementReferenceException:
                            print(f"    Article {article_idx}: Stale element reference trying XPath variant {i_xpath+1}. Skipping this article.")
                            link_element = "STALE" # Mark as stale to break from article processing
                            break


                    if link_element == "STALE":
                        continue # Skip to next article

                    if link_element:
                        href_value = link_element.get_attribute('href')
                        if href_value:
                            # Ensure it's a full URL if it's a relative path
                            if href_value.startswith("/"):
                                href_value = "https://xueqiu.com" + href_value
                            
                            if href_value not in unique_links_found:
                                print(f"    Article {article_idx}: Adding NEW unique link: {href_value}")
                                unique_links_found.add(href_value)
                            # else:
                            #    print(f"    Article {article_idx}: Link already found: {href_value}")
                        else:
                            print(f"    Article {article_idx}: Link found but href is empty.")
                    elif i_xpath == len(link_xpaths_to_try) -1 : # Only print if all variants failed
                        print(f"    Article {article_idx}: All XPath variants for link failed to find an element.")


                except StaleElementReferenceException:
                    print(f"    Article {article_idx} became stale. Skipping.")
                except Exception as e_article:
                    print(f"    Error processing article {article_idx}: {type(e_article).__name__} - {e_article}")

            # Scroll down to load more content
            if attempt < max_scroll_attempts -1: # Don't scroll on the last attempt
                print(f"  Scrolling down to load more articles... (Found {len(unique_links_found)} unique links so far)")
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 
                time.sleep(4) # Increased wait time after scroll for content to load
                
                if len(unique_links_found) == current_links_before_scroll and len(article_elements) > 0:
                    print("  No new links found after scroll, and articles were present. Might be end of feed or slow load.")
                    # Consider breaking if no new links for a couple of scrolls
                    if last_num_links_found == len(unique_links_found):
                        print("  Number of unique links hasn't changed for two scrolls. Ending.")
                        break
                last_num_links_found = len(unique_links_found)
            
        if not unique_links_found:
            print("Could not find any post links after all attempts. Saving a screenshot.")
            driver.save_screenshot(os.path.join(screenshot_dir, "no_post_links_found.png"))
        
        final_links_list = list(unique_links_found)

    except Exception as e:
        print(f"\n--- An critical error occurred ---"); print(f"Error Type: {type(e).__name__}"); print(f"Error Details: {e}")
        if driver:
            try: error_ss_path = os.path.join(screenshot_dir, "critical_error_main_feed.png"); driver.save_screenshot(error_ss_path); print(f"Saved critical error screenshot to {error_ss_path}")
            except Exception as e_ss_crit: print(f"Could not save critical error screenshot: {e_ss_crit}")
        final_links_list = list(unique_links_found) # Return what we have so far
    finally:
        if driver: print("Closing the browser..."); driver.quit(); print("Browser closed.")
    
    return final_links_list

if __name__ == "__main__":
    print(f"--- Starting Scraper for Main Xueqiu Feed Post Links ---")
    # Set max_scroll_attempts to control how many times it tries to load more content
    # For "all" on a very long page, this might need to run for a while.
    extracted_links = scrape_main_feed_post_links(max_scroll_attempts=3) # Adjust as needed
    
    print("\n" + "="*30); print("      Scraped Post Links"); print("="*30)
    if extracted_links:
        print(f"Successfully scraped {len(extracted_links)} unique post links:")
        for i, link_url in enumerate(extracted_links): print(f"{i+1}. {link_url}")
    else: print("\n>>> No post links were scraped. <<<")
    print("\n" + "="*30); print(f"Check console logs and the '{scrape_main_feed_post_links.__defaults__[0]}' folder for details.")

--- Starting Scraper for Main Xueqiu Feed Post Links ---
Setting up WebDriver for https://xueqiu.com/...
Created 'screenshots_main_feed_links' directory.
Navigating to: https://xueqiu.com/
Main feed container indicator loaded.
Looking for '跳过' (Skip) pop-up...
'跳过' pop-up not found/timed out.
Looking for 'X' pop-up (modal close)...
Finished checking for 'X' pop-ups.

--- Attempting to scrape all unique post links (max scrolls: 3) ---
Scroll Attempt #1...
  Found 7 article elements on page.
    Article 0: Found link using XPath variant 2
    Article 0: Adding NEW unique link: https://xueqiu.com/5367879511/334490187
    Article 1: Found link using XPath variant 2
    Article 1: Adding NEW unique link: https://xueqiu.com/1760673340/334516549
    Article 2: Found link using XPath variant 2
    Article 2: Adding NEW unique link: https://xueqiu.com/3609236100/334567815
    Article 3: Found link using XPath variant 2
    Article 3: Adding NEW unique link: https://xueqiu.com/9081441836/3345071

In [25]:
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    ElementClickInterceptedException,
    StaleElementReferenceException,
    ElementNotInteractableException
)
from urllib.parse import urlparse # For robust href parsing if needed, though data-tooltip is primary

def scrape_post_and_all_comments_with_author_ids(url, max_main_loops=20, scroll_pause_time=2.5):
    print("Setting up WebDriver...")
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")
    options.add_argument("--disable-gpu"); options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage"); options.add_argument("--window-size=1200,900")
    options.add_argument("--disable-notifications"); options.add_argument("--lang=zh-CN")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])

    screenshot_dir = "screenshots_comment_authors"
    if not os.path.exists(screenshot_dir):
        try: os.makedirs(screenshot_dir); print(f"Created '{screenshot_dir}' directory.")
        except OSError as e: print(f"Error creating screenshot directory: {e}"); return {"post_content": None, "post_author_id": None, "comments": []}

    driver = None
    scraped_data = {"post_content": None, "post_author_id": None, "comments": []}
    unique_comment_texts_scraped = set()

    try:
        driver = webdriver.Chrome(options=options)
        driver.maximize_window()
        wait = WebDriverWait(driver, 20)
        interaction_wait = WebDriverWait(driver, 7)

        print(f"Navigating to: {url}")
        driver.get(url)
        try:
            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'article__bd__detail')] | //div[contains(@class, 'article__content')]")))
            print("Article body indicator loaded.")
        except TimeoutException:
            print("Article body indicator did not load."); driver.save_screenshot(os.path.join(screenshot_dir,"error_page_load.png")); return scraped_data
        time.sleep(2)

        # --- Pop-up Handling ---
        try:
            print("Looking for '跳过' pop-up..."); skip_xpath = "//span[text()='跳过'] | //button[contains(.,'跳过')]"; skip_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, skip_xpath)))
            driver.execute_script("arguments[0].click();", skip_button); print("Clicked '跳过'."); time.sleep(0.5)
        except TimeoutException: print("'跳过' pop-up not found/timed out.")
        except Exception as e: print(f"Error '跳过': {e}")
        try:
            print("Looking for 'X' pop-up..."); close_xpaths = [ "//div[contains(@class,'modal-wrapper')]//i[contains(@class,'icon-close')]", "//i[contains(@class, 'cube-dialog-close')]" ]
            for xpath_item in close_xpaths:
                 try: close_button = WebDriverWait(driver, 4).until(EC.element_to_be_clickable((By.XPATH, xpath_item))); driver.execute_script("arguments[0].click();", close_button); print("Clicked 'X'."); time.sleep(0.5); break
                 except: continue
            print("Finished checking for 'X' pop-ups.")
        except Exception as e: print(f"Error 'X' pop-up: {e}")

        # Scrape Main Post Content and Author ID
        try:
            print("Scraping main post content and author ID...")
            post_content_xpath = "//div[contains(@class, 'article__bd__detail')] | //div[contains(@class, 'article__content')]"
            post_element = wait.until(EC.visibility_of_element_located((By.XPATH, post_content_xpath)))
            scraped_data["post_content"] = post_element.text.strip()
            print(f"Post content scraped (length: {len(scraped_data['post_content'])}).")
            try: # Post author ID
                post_author_link_xpath = "//div[contains(@class, 'article__author')]//a[@data-tooltip and starts-with(@href, '/')]"
                author_link_element = wait.until(EC.presence_of_element_located((By.XPATH, post_author_link_xpath)))
                author_id_val = author_link_element.get_attribute('data-tooltip')
                if author_id_val and author_id_val.isdigit(): scraped_data["post_author_id"] = author_id_val
                else:
                    href = author_link_element.get_attribute('href')
                    if href: potential_id = href.strip('/').split('/')[-1]
                    if potential_id.isdigit(): scraped_data["post_author_id"] = potential_id
                print(f"Post author ID scraped: {scraped_data.get('post_author_id', 'N/A')}")
            except TimeoutException: print("Post author link (for ID) not found.")
            except Exception as e_author: print(f"Error scraping post author ID: {e_author}")
        except Exception as e_post: print(f"Error scraping post content/author: {e_post}")

        print("\n--- Starting scroll and comment extraction ---")
        last_total_unique_texts_count = -1

        for i in range(max_main_loops):
            print(f"--- Main Loop Iteration #{i+1} ---")
            action_taken_this_loop = False

            # Expand Replies Logic (kept from your working version)
            expand_reply_xpath = "//a[contains(text(), '查看') and contains(text(), '条回复') and not(ancestor::div[contains(@style,'display: none')]) and not(ancestor::div[contains(@class,'hide')])]"
            expand_attempts = 0
            while expand_attempts < 5: 
                expand_attempts += 1; clicked_an_expand_button_this_pass = False
                try:
                    visible_expand_buttons = [b for b in driver.find_elements(By.XPATH, expand_reply_xpath) if b.is_displayed() and b.is_enabled()]
                    if not visible_expand_buttons:
                        if expand_attempts > 1: print(f"  No more 'Expand Replies' visible in sub-attempt {expand_attempts}.")
                        break
                    for button in visible_expand_buttons:
                        try:
                            if not button.is_displayed() or not button.is_enabled(): continue
                            driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});", button); time.sleep(0.4)
                            button_to_click = WebDriverWait(driver, 3).until(EC.element_to_be_clickable(button))
                            driver.execute_script("arguments[0].click();", button_to_click)
                            action_taken_this_loop = True; clicked_an_expand_button_this_pass = True; time.sleep(1.5)
                        except (StaleElementReferenceException, TimeoutException, ElementNotInteractableException): continue
                        except Exception as e_expand: print(f"    Error clicking one 'Expand Replies': {e_expand}")
                    if not clicked_an_expand_button_this_pass and expand_attempts > 1 : break
                    if clicked_an_expand_button_this_pass : time.sleep(0.5)
                except Exception as e_find_expand: print(f"  Error finding 'Expand Replies': {e_find_expand}"); break

            print("  Scrolling down...")
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)
            
            # Using the p_tag XPath that worked for text, and includes the double-underscore version from logs
            comment_p_tag_xpath = "//div[@class='comment__item__main']/p | //div[contains(@class, 'comment__content')]/p"
            # The key is that your screenshot showed comment_item_main (single underscore) for the HIS1963 comment.
            # Let's ensure we prioritize the structure from the screenshot.
            # The first part of the OR should ideally match the screenshot.
            # If the log said 'comment__item__main' was parent of p_tags, let's try that too.
            # For now, sticking to the structure that should lead to comment_item (single underscore) ancestor.
            # If comment_item_main (single) is parent of P, then ancestor::div[@class='comment_item'] should work.

            try:
                comment_p_tags = driver.find_elements(By.XPATH, comment_p_tag_xpath)
                new_comments_added_to_list_this_pass = 0
                
                if comment_p_tags: print(f"    Found {len(comment_p_tags)} potential comment <p> tags for text.")

                for p_tag_idx, p_tag in enumerate(comment_p_tags):
                    comment_text = ""; author_id = None
                    try:
                        comment_text = p_tag.text.strip()
                        if not comment_text: continue

                        if comment_text not in unique_comment_texts_scraped:
                            unique_comment_texts_scraped.add(comment_text)
                            print(f"        Processing new unique text (idx {p_tag_idx}): '{comment_text[:30]}...'")
                            author_id = None 
                            comment_item_boundary = None # Initialize

                            try:
                                # Attempt to find the 'comment_item' ancestor, exactly as per your screenshot structure
                                comment_item_boundary = p_tag.find_element(By.XPATH, "ancestor::div[@class='comment_item' and @data-id][1]")
                                boundary_data_id = comment_item_boundary.get_attribute('data-id')
                                print(f"            p_idx {p_tag_idx}: Found 'comment_item' boundary (data-id: {boundary_data_id}) for text '{comment_text[:20]}...'")
                                
                                # 1. Try user-name link (most specific to your screenshot)
                                try:
                                    author_link_el = comment_item_boundary.find_element(By.XPATH, ".//div[@class='comment_item_main_hd']//a[@class='user-name' and @data-tooltip]")
                                    print(f"                p_idx {p_tag_idx}: Found 'user-name' link.")
                                    temp_author_id = author_link_el.get_attribute('data-tooltip')
                                    if temp_author_id and temp_author_id.isdigit(): author_id = temp_author_id
                                    else:
                                        href = author_link_el.get_attribute('href')
                                        if href: potential_id = href.strip('/').split('/')[-1];
                                        if potential_id and potential_id.isdigit(): author_id = potential_id # check potential_id is not empty
                                    if author_id: print(f"                    p_idx {p_tag_idx}: Extracted author_id: {author_id} (from user-name)")
                                
                                except NoSuchElementException:
                                    print(f"                p_idx {p_tag_idx}: 'user-name' link NOT found in 'comment_item' (data-id: {boundary_data_id}).")
                                    # 2. Fallback to avatar link (also specific to screenshot structure from comment_item)
                                    try:
                                        author_link_el = comment_item_boundary.find_element(By.XPATH, "./a[@class='avatar' and @data-tooltip]") # Avatar is direct child of comment_item
                                        print(f"                    p_idx {p_tag_idx}: Found 'avatar' link (direct child of comment_item).")
                                        temp_author_id = author_link_el.get_attribute('data-tooltip')
                                        if temp_author_id and temp_author_id.isdigit(): author_id = temp_author_id
                                        else:
                                            href = author_link_el.get_attribute('href')
                                            if href: potential_id = href.strip('/').split('/')[-1];
                                            if potential_id and potential_id.isdigit(): author_id = potential_id
                                        if author_id: print(f"                        p_idx {p_tag_idx}: Extracted author_id: {author_id} (from avatar)")
                                    except NoSuchElementException:
                                        print(f"                    p_idx {p_tag_idx}: 'avatar' link (direct child) also NOT found in 'comment_item' (data-id: {boundary_data_id}).")
                                        
                            except NoSuchElementException:
                                print(f"            p_idx {p_tag_idx}: MAJOR FAIL: Could NOT find 'comment_item' (class='comment_item' and @data-id) ancestor for p_tag. Text: '{comment_text[:20]}...'")
                            except Exception as e_author_find:
                                print(f"            p_idx {p_tag_idx}: Error finding author ID for text '{comment_text[:20]}...': {type(e_author_find).__name__} - {e_author_find}")

                            scraped_data["comments"].append({"text": comment_text, "author_id": author_id})
                            new_comments_added_to_list_this_pass += 1
                            
                    except StaleElementReferenceException: print(f"      Stale p_tag {p_tag_idx} encountered.") ; continue 
                    except Exception as e_proc_p: print(f"      Error processing one p_tag (idx {p_tag_idx}): {type(e_proc_p).__name__} - {e_proc_p}")
                
                if new_comments_added_to_list_this_pass > 0:
                    print(f"    Added {new_comments_added_to_list_this_pass} new unique comments to data list.")
                    action_taken_this_loop = True
            except Exception as e_find_comments: print(f"  Error finding comment <p> tags: {e_find_comments}")

            # Load More Comments Logic
            load_more_comments_xpath = "//a[@class='show_more' and .//span[text()='展开查看更多']]"
            try:
                load_more_button = interaction_wait.until(EC.element_to_be_clickable((By.XPATH, load_more_comments_xpath)))
                driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});", load_more_button); time.sleep(0.4)
                driver.execute_script("arguments[0].click();", load_more_button)
                action_taken_this_loop = True; time.sleep(scroll_pause_time + 0.5)
            except TimeoutException: pass # Button not always present
            except Exception as e_load_more: print(f"  Error clicking '展开查看更多': {e_load_more}")

            current_unique_texts_count = len(unique_comment_texts_scraped)
            print(f"  Loop {i+1} end. Total unique comment texts scraped: {current_unique_texts_count}. Previously: {last_total_unique_texts_count}. Total comments in list: {len(scraped_data['comments'])}")

            if not action_taken_this_loop and current_unique_texts_count == last_total_unique_texts_count:
                if last_total_unique_texts_count != -1 or i > 0 : 
                    print("No actions taken and unique comment text count unchanged. Assuming completion.")
                    break
            last_total_unique_texts_count = current_unique_texts_count
            if i == max_main_loops - 1: print("Reached max main loops.")
            driver.save_screenshot(os.path.join(screenshot_dir,f"main_loop_end_{i+1}.png"))

        print(f"\n--- Finished comment scraping. Total comments in list: {len(scraped_data['comments'])} ---")

    except Exception as e:
        print(f"\n--- An critical error occurred ---"); print(f"Error Type: {type(e).__name__}"); print(f"Error Details: {e}")
        if driver:
            try: error_ss_path = os.path.join(screenshot_dir, "critical_error_comment_authors.png"); driver.save_screenshot(error_ss_path); print(f"Saved critical error screenshot.")
            except Exception as e_ss_crit: print(f"Could not save critical error screenshot: {e_ss_crit}")
    finally:
        if driver: print("Closing the browser..."); driver.quit(); print("Browser closed.")
    return scraped_data

if __name__ == "__main__":
    target_url = "https://xueqiu.com/5669998349/334081638" # Your example post
    # target_url = "https://xueqiu.com/1929796349/272374603" # Example with many comments & replies for testing robustness
    print(f"--- Starting Scraper for URL: {target_url} (Comment Author IDs) ---")
    
    data = scrape_post_and_all_comments_with_author_ids(target_url, max_main_loops=3, scroll_pause_time=2.5)

    print("\n" + "="*30); print("      Scraped Data Summary"); print("="*30)
    if data["post_content"]:
        print("\n--- Main Post ---"); print(f"Author ID: {data.get('post_author_id', 'N/A')}"); print(data["post_content"])
    else: print("\n>>> Main post content not scraped. <<<")

    if data["comments"]:
        print(f"\n--- Comments ({len(data['comments'])}) ---")
        for idx, comment_data_item in enumerate(data["comments"]):
            print(f"{idx+1}. Author ID: {comment_data_item.get('author_id', 'N/A')}, Comment: {comment_data_item['text']}")
    else: print("\n>>> No comments were scraped. <<<")
    print("\n" + "="*30); print(f"Check console logs and 'screenshots_comment_authors' folder for details.")

--- Starting Scraper for URL: https://xueqiu.com/5669998349/334081638 (Comment Author IDs) ---
Setting up WebDriver...
Created 'screenshots_comment_authors' directory.
Navigating to: https://xueqiu.com/5669998349/334081638
Article body indicator loaded.
Looking for '跳过' pop-up...
'跳过' pop-up not found/timed out.
Looking for 'X' pop-up...
Finished checking for 'X' pop-ups.
Scraping main post content and author ID...
Post content scraped (length: 706).
Post author ID scraped: 5669998349

--- Starting scroll and comment extraction ---
--- Main Loop Iteration #1 ---
  Scrolling down...
    Found 19 potential comment <p> tags for text.
        Processing new unique text (idx 0): '十年二十年后，大概也这样了，会出什么龙头公司股票呢？...'
            p_idx 0: MAJOR FAIL: Could NOT find 'comment_item' (class='comment_item' and @data-id) ancestor for p_tag. Text: '十年二十年后，大概也这样了，会出什么龙头...'
        Processing new unique text (idx 1): '没有人喜欢照顾别人的情绪，但是所有人都想从别人那里找到情绪价...'
            p_idx 1: MAJOR FAIL: Could NOT find 'comme

In [27]:
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    ElementClickInterceptedException,
    StaleElementReferenceException,
    ElementNotInteractableException
)

def scrape_post_and_all_comments_final_attempt(url, max_main_loops=20, scroll_pause_time=2.5):
    print("Setting up WebDriver...")
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")
    options.add_argument("--disable-gpu"); options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage"); options.add_argument("--window-size=1200,900")
    options.add_argument("--disable-notifications"); options.add_argument("--lang=zh-CN")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])

    screenshot_dir = "screenshots_comment_authors_final"
    if not os.path.exists(screenshot_dir):
        try: os.makedirs(screenshot_dir); print(f"Created '{screenshot_dir}' directory.")
        except OSError as e: print(f"Error creating screenshot directory: {e}"); return {"post_content": None, "post_author_id": None, "comments": []}

    driver = None
    scraped_data = {"post_content": None, "post_author_id": None, "comments": []}
    unique_comment_texts_scraped = set()

    try:
        driver = webdriver.Chrome(options=options)
        driver.maximize_window()
        wait = WebDriverWait(driver, 20)
        interaction_wait = WebDriverWait(driver, 7)

        print(f"Navigating to: {url}")
        driver.get(url)
        try:
            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'article__bd__detail')] | //div[contains(@class, 'article__content')]")))
            print("Article body indicator loaded.")
        except TimeoutException:
            print("Article body indicator did not load."); driver.save_screenshot(os.path.join(screenshot_dir,"error_page_load.png")); return scraped_data
        time.sleep(2)

        # --- Pop-up Handling ---
        try:
            print("Looking for '跳过' pop-up..."); skip_xpath = "//span[text()='跳过'] | //button[contains(.,'跳过')]"; skip_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, skip_xpath)))
            driver.execute_script("arguments[0].click();", skip_button); print("Clicked '跳过'."); time.sleep(0.5)
        except TimeoutException: print("'跳过' pop-up not found/timed out.")
        except Exception as e: print(f"Error '跳过': {e}")
        try:
            print("Looking for 'X' pop-up..."); close_xpaths = [ "//div[contains(@class,'modal-wrapper')]//i[contains(@class,'icon-close')]", "//i[contains(@class, 'cube-dialog-close')]" ]
            for xpath_item in close_xpaths:
                 try: close_button = WebDriverWait(driver, 4).until(EC.element_to_be_clickable((By.XPATH, xpath_item))); driver.execute_script("arguments[0].click();", close_button); print("Clicked 'X'."); time.sleep(0.5); break
                 except: continue
            print("Finished checking for 'X' pop-ups.")
        except Exception as e: print(f"Error 'X' pop-up: {e}")

        # Scrape Main Post Content and Author ID
        try:
            print("Scraping main post content and author ID...")
            post_content_xpath = "//div[contains(@class, 'article__bd__detail')] | //div[contains(@class, 'article__content')]"
            post_element = wait.until(EC.visibility_of_element_located((By.XPATH, post_content_xpath)))
            scraped_data["post_content"] = post_element.text.strip()
            print(f"Post content scraped (length: {len(scraped_data['post_content'])}).")
            try: 
                post_author_link_xpath = "//div[contains(@class, 'article__author')]//a[@data-tooltip and starts-with(@href, '/')]"
                author_link_element = wait.until(EC.presence_of_element_located((By.XPATH, post_author_link_xpath)))
                author_id_val = author_link_element.get_attribute('data-tooltip')
                if author_id_val and author_id_val.isdigit(): scraped_data["post_author_id"] = author_id_val
                else:
                    href = author_link_element.get_attribute('href')
                    if href: potential_id = href.strip('/').split('/')[-1]
                    if potential_id.isdigit(): scraped_data["post_author_id"] = potential_id
                print(f"Post author ID scraped: {scraped_data.get('post_author_id', 'N/A')}")
            except TimeoutException: print("Post author link (for ID) not found.")
            except Exception as e_author: print(f"Error scraping post author ID: {e_author}")
        except Exception as e_post: print(f"Error scraping post content/author: {e_post}")

        print("\n--- Starting scroll and comment extraction ---")
        last_total_unique_texts_count = -1

        for i in range(max_main_loops):
            print(f"--- Main Loop Iteration #{i+1} ---")
            action_taken_this_loop = False

            expand_reply_xpath = "//a[contains(text(), '查看') and contains(text(), '条回复') and not(ancestor::div[contains(@style,'display: none')]) and not(ancestor::div[contains(@class,'hide')])]"
            expand_attempts = 0
            while expand_attempts < 5: 
                expand_attempts += 1; clicked_an_expand_button_this_pass = False
                try:
                    visible_expand_buttons = [b for b in driver.find_elements(By.XPATH, expand_reply_xpath) if b.is_displayed() and b.is_enabled()]
                    if not visible_expand_buttons:
                        if expand_attempts > 1: print(f"  No more 'Expand Replies' visible in sub-attempt {expand_attempts}.")
                        break
                    for button in visible_expand_buttons:
                        try:
                            if not button.is_displayed() or not button.is_enabled(): continue
                            driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});", button); time.sleep(0.4)
                            button_to_click = WebDriverWait(driver, 3).until(EC.element_to_be_clickable(button))
                            driver.execute_script("arguments[0].click();", button_to_click)
                            action_taken_this_loop = True; clicked_an_expand_button_this_pass = True; time.sleep(1.5)
                        except (StaleElementReferenceException, TimeoutException, ElementNotInteractableException): continue
                        except Exception as e_expand: print(f"    Error clicking one 'Expand Replies': {e_expand}")
                    if not clicked_an_expand_button_this_pass and expand_attempts > 1 : break
                    if clicked_an_expand_button_this_pass : time.sleep(0.5)
                except Exception as e_find_expand: print(f"  Error finding 'Expand Replies': {e_find_expand}"); break

            print("  Scrolling down...")
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)
            
            # XPath for comment text <p> tags.
            # Prioritizing the structure from your screenshot: div.comment_item_main (single underscore)
            # Fallback to div.comment__item__main (double underscore from logs) or div.comment__content (for replies)
            comment_p_tag_xpath = ("//div[@class='comment_item_main']/p " # Exact match for screenshot structure
                                   "| //div[@class='comment__item__main']/p " # Match from logs
                                   "| //div[contains(@class, 'comment__content')]/p") # For replies

            try:
                comment_p_tags = driver.find_elements(By.XPATH, comment_p_tag_xpath)
                new_comments_added_to_list_this_pass = 0
                
                if comment_p_tags: print(f"    Found {len(comment_p_tags)} potential comment <p> tags for text.")

                for p_tag_idx, p_tag in enumerate(comment_p_tags):
                    comment_text = ""; author_id = None; p_parent_div = None
                    try:
                        comment_text = p_tag.text.strip()
                        if not comment_text: continue

                        if comment_text not in unique_comment_texts_scraped:
                            unique_comment_texts_scraped.add(comment_text)
                            print(f"        Processing new unique text (idx {p_tag_idx}): '{comment_text[:30]}...'")
                            
                            try:
                                p_parent_div = p_tag.find_element(By.XPATH, "./parent::div")
                                parent_class = p_parent_div.get_attribute('class')
                                print(f"            p_idx {p_tag_idx}: Parent of p_tag is <div class='{parent_class}'>")

                                # Strategy 1: Based on screenshot (p_parent is div.comment_item_main)
                                if "comment_item_main" == parent_class or "comment__item__main" == parent_class: # Allow single or double underscore
                                    try:
                                        header_div = p_parent_div.find_element(By.XPATH, "./div[@class='comment_item_main_hd']") # Exact class
                                        print(f"                p_idx {p_tag_idx}: Found header_div <div class='{header_div.get_attribute('class')}'>")
                                        author_link_el = header_div.find_element(By.XPATH, "./a[@class='user-name' and @data-tooltip]")
                                        print(f"                    p_idx {p_tag_idx}: Found 'user-name' link.")
                                        temp_author_id = author_link_el.get_attribute('data-tooltip')
                                        if temp_author_id and temp_author_id.isdigit(): author_id = temp_author_id
                                        else:
                                            href = author_link_el.get_attribute('href')
                                            if href: potential_id = href.strip('/').split('/')[-1];
                                            if potential_id and potential_id.isdigit(): author_id = potential_id
                                        if author_id: print(f"                        p_idx {p_tag_idx}: Extracted author_id: {author_id} (from user-name)")
                                    except NoSuchElementException:
                                        print(f"                p_idx {p_tag_idx}: 'user-name' structure NOT found under <div class='{parent_class}'>.")
                                
                                # Fallback: If author_id still None, try avatar from grandparent (div.comment_item)
                                if author_id is None and p_parent_div is not None: # Check p_parent_div exists
                                    print(f"            p_idx {p_tag_idx}: Author_id still None. Trying avatar fallback.")
                                    try:
                                        # Parent of 'comment_item_main' (or 'comment__item__main') should be 'comment_item'
                                        comment_item_div = p_parent_div.find_element(By.XPATH, "./parent::div[@class='comment_item' and @data-id]")
                                        item_data_id = comment_item_div.get_attribute('data-id')
                                        print(f"                p_idx {p_tag_idx}: Fallback: Found grandparent 'comment_item' (data-id: {item_data_id}).")
                                        author_link_el = comment_item_div.find_element(By.XPATH, "./a[@class='avatar' and @data-tooltip]") # Avatar is direct child
                                        print(f"                    p_idx {p_tag_idx}: Fallback: Found 'avatar' link.")
                                        temp_author_id = author_link_el.get_attribute('data-tooltip')
                                        if temp_author_id and temp_author_id.isdigit(): author_id = temp_author_id
                                        else:
                                            href = author_link_el.get_attribute('href')
                                            if href: potential_id = href.strip('/').split('/')[-1];
                                            if potential_id and potential_id.isdigit(): author_id = potential_id
                                        if author_id: print(f"                        p_idx {p_tag_idx}: Fallback: Extracted author_id: {author_id} (from avatar)")
                                    except NoSuchElementException:
                                        print(f"                p_idx {p_tag_idx}: Fallback: 'comment_item' grandparent or 'avatar' link NOT found.")
                                        
                            except NoSuchElementException:
                                print(f"            p_idx {p_tag_idx}: Parent <div> of p_tag NOT found. Cannot determine author. Text: '{comment_text[:20]}...'")
                            except Exception as e_author_find:
                                print(f"            p_idx {p_tag_idx}: Error finding author ID for text '{comment_text[:20]}...': {type(e_author_find).__name__} - {e_author_find}")

                            scraped_data["comments"].append({"text": comment_text, "author_id": author_id})
                            new_comments_added_to_list_this_pass += 1
                            
                    except StaleElementReferenceException: print(f"      Stale p_tag {p_tag_idx} encountered.") ; continue 
                    except Exception as e_proc_p: print(f"      Error processing one p_tag (idx {p_tag_idx}): {type(e_proc_p).__name__} - {e_proc_p}")
                
                if new_comments_added_to_list_this_pass > 0:
                    print(f"    Added {new_comments_added_to_list_this_pass} new unique comments to data list.")
                    action_taken_this_loop = True
            except Exception as e_find_comments: print(f"  Error finding comment <p> tags: {e_find_comments}")

            # Load More Comments Logic
            load_more_comments_xpath = "//a[@class='show_more' and .//span[text()='展开查看更多']]"
            try:
                load_more_button = interaction_wait.until(EC.element_to_be_clickable((By.XPATH, load_more_comments_xpath)))
                driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});", load_more_button); time.sleep(0.4)
                driver.execute_script("arguments[0].click();", load_more_button)
                action_taken_this_loop = True; time.sleep(scroll_pause_time + 0.5)
            except TimeoutException: pass 
            except Exception as e_load_more: print(f"  Error clicking '展开查看更多': {e_load_more}")

            current_unique_texts_count = len(unique_comment_texts_scraped)
            print(f"  Loop {i+1} end. Total unique comment texts scraped: {current_unique_texts_count}. Previously: {last_total_unique_texts_count}. Total comments in list: {len(scraped_data['comments'])}")

            if not action_taken_this_loop and current_unique_texts_count == last_total_unique_texts_count:
                if last_total_unique_texts_count != -1 or i > 0 : 
                    print("No actions taken and unique comment text count unchanged. Assuming completion.")
                    break
            last_total_unique_texts_count = current_unique_texts_count
            if i == max_main_loops - 1: print("Reached max main loops.")
            driver.save_screenshot(os.path.join(screenshot_dir,f"main_loop_end_{i+1}.png"))

        print(f"\n--- Finished comment scraping. Total comments in list: {len(scraped_data['comments'])} ---")

    except Exception as e:
        print(f"\n--- An critical error occurred ---"); print(f"Error Type: {type(e).__name__}"); print(f"Error Details: {e}")
        if driver:
            try: error_ss_path = os.path.join(screenshot_dir, "critical_error_comment_authors_final.png"); driver.save_screenshot(error_ss_path); print(f"Saved critical error screenshot.")
            except Exception as e_ss_crit: print(f"Could not save critical error screenshot: {e_ss_crit}")
    finally:
        if driver: print("Closing the browser..."); driver.quit(); print("Browser closed.")
    return scraped_data

if __name__ == "__main__":
    target_url = "https://xueqiu.com/5367879511/334490187" # Using your new example URL
    print(f"--- Starting Scraper for URL: {target_url} (Comment Author IDs - Final Attempt Structure) ---")
    
    data = scrape_post_and_all_comments_final_attempt(target_url, max_main_loops=3, scroll_pause_time=2.5)

    print("\n" + "="*30); print("      Scraped Data Summary"); print("="*30)
    if data["post_content"]:
        print("\n--- Main Post ---"); print(f"Author ID: {data.get('post_author_id', 'N/A')}"); print(data["post_content"])
    else: print("\n>>> Main post content not scraped. <<<")

    if data["comments"]:
        print(f"\n--- Comments ({len(data['comments'])}) ---")
        for idx, comment_data_item in enumerate(data["comments"]):
            print(f"{idx+1}. Author ID: {comment_data_item.get('author_id', 'N/A')}, Comment: {comment_data_item['text']}")
    else: print("\n>>> No comments were scraped. <<<")
    print("\n" + "="*30); print(f"Check console logs and 'screenshots_comment_authors_final' folder for details.")

--- Starting Scraper for URL: https://xueqiu.com/5367879511/334490187 (Comment Author IDs - Final Attempt Structure) ---
Setting up WebDriver...
Created 'screenshots_comment_authors_final' directory.
Navigating to: https://xueqiu.com/5367879511/334490187
Article body indicator loaded.
Looking for '跳过' pop-up...
'跳过' pop-up not found/timed out.
Looking for 'X' pop-up...
Finished checking for 'X' pop-ups.
Scraping main post content and author ID...
Post content scraped (length: 1032).
Post author ID scraped: 5367879511

--- Starting scroll and comment extraction ---
--- Main Loop Iteration #1 ---
  Scrolling down...
    Found 20 potential comment <p> tags for text.
        Processing new unique text (idx 0): '原来你是省心省力...'
            p_idx 0: Parent of p_tag is <div class='comment__item__main'>
                p_idx 0: 'user-name' structure NOT found under <div class='comment__item__main'>.
            p_idx 0: Author_id still None. Trying avatar fallback.
                p_idx 0: Fallba