In [10]:
!pip install selenium webdriver-manager pandas

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver-manager
Successfully installed webdriver-manager-4.0.2


In [138]:
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# === Setup Chrome ===
# Command-line flags for the browser: open maximized, and disable Blink's
# "AutomationControlled" feature.
chrome_flags = (
    "--start-maximized",
    "--disable-blink-features=AutomationControlled",
)

options = Options()
for chrome_flag in chrome_flags:
    options.add_argument(chrome_flag)

# The value returned by webdriver-manager's install() is handed to Service.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)

# === Login manually ===
# Open the target question, then block on the notebook prompt until the
# operator confirms that the manual login in the browser window is finished.
question_url = "https://www.zhihu.com/question/460425718"
driver.get(question_url)
input("🔐 Log in to Zhihu manually, then press Enter here to continue...")

# === Scroll settings ===
# Scroll the page down in fixed steps until it stops growing, so that answers
# loaded on scroll are present in the DOM before extraction.
driver.set_script_timeout(60)
scroll_pause_time = 3.5   # seconds to wait after each scroll for new content
scroll_position = 0       # y offset (px) requested by the most recent scroll
increment = 500           # px advanced per scroll
scroll_count = 0
max_same_scrolls = 10     # consecutive stalled scrolls tolerated before stopping
same_scroll_count = 0
start_time = time.time()
time_limit_sec = 9000     # hard cap on total scrolling time (2.5 hours)
last_scroll_height = 0    # page height measured after the previous scroll

while True:
    if time.time() - start_time > time_limit_sec:
        print("⏰ Time limit reached.")
        break

    scroll_position += increment
    driver.execute_script(f"window.scrollTo(0, {scroll_position});")
    time.sleep(scroll_pause_time)
    scroll_count += 1

    # Measure AFTER the pause so content loaded by this scroll is counted.
    try:
        scroll_height = driver.execute_script("return document.body.scrollHeight")
    except WebDriverException:
        # Height unavailable on this pass: treat it as "not stalled" and retry
        # on the next scroll. Ctrl-C is no longer swallowed here.
        same_scroll_count = 0
        continue

    reached_bottom = scroll_position >= scroll_height
    page_grew = scroll_height > last_scroll_height
    last_scroll_height = scroll_height

    # A scroll is "stalled" only when we are past the end of the page AND the
    # page did not grow since the previous measurement.
    if reached_bottom and not page_grew:
        same_scroll_count += 1
    else:
        same_scroll_count = 0

    if reached_bottom:
        # Pin the target to the real bottom so it cannot run ahead of the page;
        # otherwise later growth smaller than the overshoot would go unnoticed.
        scroll_position = scroll_height

    if same_scroll_count >= max_same_scrolls:
        print("✅ No more content to scroll.")
        break

print(f"🔁 Finished scrolling after {scroll_count} scrolls.")
time.sleep(5)

# === Extract answers ===
# Build one record per ".List-item" element. Every field is best-effort: a
# field that cannot be read is stored as "" and the record is still kept.

# Failures expected while probing a single answer card: the element is missing
# or no longer attached (both are WebDriverException subclasses), or a lookup
# returned None (AttributeError). Anything else -- including Ctrl-C --
# propagates instead of being silently swallowed.
_FIELD_ERRORS = (WebDriverException, AttributeError)


def _parse_count(text):
    """Return the first count found in ``text`` as a digit string, or "".

    Plain digits come back unchanged ("赞同 123" -> "123"). Thousands
    separators and the 万 (x10,000) suffix are expanded ("1,234" -> "1234",
    "1.2 万" -> "12000") so abbreviated counts are not cut down to their
    leading digits.
    """
    match = re.search(r"(\d[\d,]*)(?:(\.\d+)?\s*(万))?", text)
    if not match:
        return ""
    whole = match.group(1).replace(",", "")
    if match.group(3):
        # The decimal part is only captured together with the 万 suffix.
        return str(round(float(whole + (match.group(2) or "")) * 10_000))
    return whole


answer_blocks = driver.find_elements(By.CSS_SELECTOR, ".List-item")
answers_data = []

for block in answer_blocks:
    try:
        author = block.find_element(By.CSS_SELECTOR, "meta[itemprop='name']").get_attribute("content").strip()
    except _FIELD_ERRORS:
        author = ""

    try:
        answer = block.find_element(By.CSS_SELECTOR, ".RichContent-inner").text.strip()
    except _FIELD_ERRORS:
        answer = ""

    try:
        upvotes = _parse_count(block.find_element(By.CSS_SELECTOR, ".css-1lr85n").text)
    except _FIELD_ERRORS:
        upvotes = ""

    # The comment count is read from the card's full text ("N 条评论").
    try:
        full_text = block.text
    except _FIELD_ERRORS:
        full_text = ""
    comments_match = re.search(r"(\d[\d,]*)\s*条评论", full_text)
    comments = comments_match.group(1).replace(",", "") if comments_match else ""

    try:
        # Match text like "发布于 2022-06-15"
        post_date_element = block.find_element(By.XPATH, ".//*[contains(text(), '发布于')]")
        post_date_match = re.search(r"发布于\s+(\d{4}-\d{2}-\d{2})", post_date_element.text)
        post_date = post_date_match.group(1) if post_date_match else ""
    except _FIELD_ERRORS:
        post_date = ""

    answers_data.append({
        "Author": author,
        "Date": post_date,
        "Upvotes": upvotes,
        "Comments": comments,
        "Answer": answer
    })

# === Save to CSV ===
# Columns are listed explicitly so the CSV always gets a header row, even when
# no answers were scraped; a header-less empty file cannot be read back with
# pd.read_csv. With data present the column order matches the record keys.
csv_columns = ["Author", "Date", "Upvotes", "Comments", "Answer"]
df = pd.DataFrame(answers_data, columns=csv_columns)
# "utf-8-sig" prepends a BOM, which helps spreadsheet tools detect UTF-8.
df.to_csv("zhihu_answers_full.csv", index=False, encoding="utf-8-sig")
print(f"✅ Done. {len(df)} answers saved to 'zhihu_answers_full.csv'")

# Close the browser and end the WebDriver session.
driver.quit()


🔐 Log in to Zhihu manually, then press Enter here to continue... 


✅ No more content to scroll.
🔁 Finished scrolling after 17 scrolls.
✅ Done. 11 answers saved to 'zhihu_answers_full.csv'


In [142]:
from IPython.display import FileLink

# Build a link to the exported CSV; leaving it as the cell's last expression
# makes the notebook render it as a clickable download link.
download_link = FileLink("zhihu_answers_full.csv")
download_link