In [1]:
import requests

url = "https://item.jd.com/100095516761.html"

# Send a browser-like User-Agent instead of the default python-requests one.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Always pass a timeout: without one, requests can wait indefinitely on a
# stalled connection and the cell never finishes.
response = requests.get(url, headers=headers, timeout=10)

# NOTE(review): a 200 status only means *some* page was returned; it may be a
# login or verification page rather than the product page -- confirm by
# inspecting the saved file.
if response.status_code == 200:
    print("请求成功!")

    # Use the last path segment of the URL as the file name.
    filename = url.split("/")[-1]

    # Make sure the file carries an .html suffix.
    if not filename.endswith('.html'):
        filename += '.html'

    # Write the decoded response body into the current working directory.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response.text)

    print(f"页面内容已保存到当前文件夹: {filename}")
else:
    print("请求失败,状态码:", response.status_code)

请求成功!
页面内容已保存到当前文件夹: 100095516761.html


In [3]:
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from time import sleep
from lxml import etree
import pandas as pd

# Initial parameters
url = 'https://www.jd.com/'
pages = 2  # number of search-result pages to scrape
comment_pages = 5  # number of comment pages to scrape

# Launch a Chrome browser session
driver = webdriver.Chrome()

# Shared explicit-wait helper with a 10-second timeout
wait = WebDriverWait(driver, 10)

# Accumulators for the scraped products and comments
data = pd.DataFrame()
comments = pd.DataFrame()

# Open the JD home page
driver.get(url)

# Wait until the login link is clickable, then click it
login_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'link-login')))
login_button.click()

# Login is manual: the script blocks here until the user presses Enter
print("请使用京东 App 扫描二维码登录")
input("登录完成后按回车键继续...")

# Search for the product
key = '华为p70 pro'
search_box = wait.until(EC.presence_of_element_located((By.ID, 'key')))
search_box.clear()
search_box.send_keys(key)  # type the search keyword
search_box.send_keys(Keys.RETURN)  # press Enter to submit the search
sleep(3)  # fixed pause; presumably gives the result page time to load

# 定义函数以向下滚动页面
def scroll_down(times=2, pause=3):
    """Scroll the current page to the bottom a number of times.

    Uses the module-level ``driver``. Each pass jumps to the bottom of the
    document and then sleeps, presumably so lazily loaded items can render
    before the page is parsed.

    Args:
        times: Number of scroll-to-bottom passes (default 2).
        pause: Seconds to sleep after each pass (default 3).
    """
    for _ in range(times):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(pause)

# 定义函数以提取页面上的产品信息
def get_content():
    """Parse the product tiles of the current result page into ``data``.

    Reads ``driver.page_source``, builds one record per ``div.gl-i-wrap`` tile
    (price, comment count, shop name, product URL, title, image link) and
    appends all records of the page to the module-level ``data`` DataFrame in
    a single ``pd.concat`` call.

    Fields that are missing from a tile are stored as the string ``'None'``
    (the convention already used for the shop name) instead of raising
    ``IndexError``. Tiles without a product link are skipped entirely.
    """
    global data  # results accumulate across pages in the module-level frame

    def first(nodes, default='None'):
        """Return the first XPath match, or ``default`` if nothing matched."""
        return nodes[0] if nodes else default

    def absolute(link):
        """Add ``https:`` to protocol-relative links; leave others untouched."""
        return 'https:' + link if link.startswith('//') else link

    html = etree.HTML(driver.page_source)
    records = []
    for item in html.xpath('//div[@class="gl-i-wrap"]'):
        href = first(item.xpath('.//div[@class="p-img"]/a/@href'), default=None)
        if href is None:
            # Without a link the tile cannot be opened later; skip it.
            continue

        title_nodes = item.xpath('.//div[@class="p-name p-name-type-2"]/a/em')

        # Prefer the lazy-load attribute; fall back to ``src`` when it is
        # absent, empty, or holds the placeholder value 'done'.
        image = first(item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img'), default=None)
        if not image or image == 'done':
            image = first(item.xpath('.//div[@class="p-img"]/a/img/@src'), default=None)

        records.append({
            'price': first(item.xpath('.//div[@class="p-price"]/strong/i/text()')),
            'comment': first(item.xpath('.//div[@class="p-commit"]/strong/a/text()')),
            'shopname': first(item.xpath('.//div[@class="p-shop"]/span/a/text()')),
            'URL': absolute(href),
            # string(.) flattens the nested markup inside <em> into plain text.
            'title': title_nodes[0].xpath('string(.)').strip() if title_nodes else 'None',
            'pnglink': absolute(image) if image else 'None',
        })

    if records:
        # One concat per page (not per row) and a clean 0..n-1 index.
        data = pd.concat([data, pd.DataFrame(records)], ignore_index=True)

# 定义函数以点击下一页按钮
def next_page():
    """Advance the search results to the next page.

    Waits (using the module-level ``wait``) for an enabled ``.pn-next`` button,
    clicks it through JavaScript and sleeps 4 seconds.

    Returns:
        True if the button was clicked, False if no enabled button became
        clickable within the wait timeout or the click failed.
    """
    try:
        next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.pn-next:not(.disabled)')))
        # Click via JavaScript rather than WebElement.click().
        driver.execute_script("arguments[0].click();", next_button)
        sleep(4)
        return True
    except Exception:
        # ``Exception`` (not a bare ``except``) so Ctrl-C / SystemExit still
        # interrupt the scrape instead of being reported as "no more pages".
        print("No more pages or unable to click next page button")
        return False

# 抓取指定页数的产品信息
for page_number in range(1, pages + 1):
    scroll_down()  # scroll so the whole result list is rendered
    get_content()  # parse the tiles of the current page
    # Stop after the last requested page instead of loading one more page
    # that would never be scraped; also stop when there is no next page.
    if page_number == pages or not next_page():
        break

# 获取产品评论（针对第一页的第一个产品）
if not data.empty:
    # Comments are scraped only for the first product that was collected.
    first_product_url = data.iloc[0]['URL']
    driver.get(first_product_url)
    sleep(3)

    # Switch to the comment tab of the product page.
    comment_tab = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'li[data-anchor="#comment"]')))
    comment_tab.click()
    sleep(3)

    def extract_comments():
        """Append the comments shown on the current comment page to ``comments``.

        Each ``div.comment-item`` yields content, time, user text and a score
        (the number of ``span.star-red`` elements). Items missing one of the
        expected child elements are skipped and counted instead of aborting
        the whole scrape.
        """
        global comments  # results accumulate in the module-level DataFrame
        from selenium.common.exceptions import NoSuchElementException

        rows = []
        skipped = 0
        for comment in driver.find_elements(By.CSS_SELECTOR, 'div.comment-item'):
            try:
                rows.append({
                    'content': comment.find_element(By.CSS_SELECTOR, 'p.comment-con').text,
                    'time': comment.find_element(By.CSS_SELECTOR, 'span.comment-time').text,
                    'user': comment.find_element(By.CSS_SELECTOR, 'div.user-info').text,
                    'score': len(comment.find_elements(By.CSS_SELECTOR, 'span.star-red')),
                })
            except NoSuchElementException:
                skipped += 1
        if skipped:
            print(f"Skipped {skipped} comment(s) with missing fields")
        if rows:
            # One concat per page (not per row) and a clean 0..n-1 index.
            comments = pd.concat([comments, pd.DataFrame(rows)], ignore_index=True)

    def next_comment_page():
        """Click the comment pager's "next" link.

        Returns:
            True if the link was clicked, False if it is disabled, did not
            become clickable within the wait timeout, or the click failed.
        """
        try:
            next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.ui-pager-next')))
            # get_attribute returns None when the attribute is absent.
            if 'disabled' in (next_button.get_attribute('class') or ''):
                return False
            next_button.click()
            sleep(3)
            return True
        except Exception:
            # ``Exception`` (not a bare ``except``) keeps Ctrl-C working.
            return False

    # Scrape the requested number of comment pages; do not click "next" after
    # the last requested page.
    for page_number in range(1, comment_pages + 1):
        extract_comments()
        if page_number == comment_pages or not next_comment_page():
            break

# Shut the browser session down.
driver.quit()

# Show the scraped product and comment tables.
print("Products:", data, "\nComments:", comments, sep="\n")

# Persist both tables as CSV. 'utf-8-sig' prepends a BOM, presumably so that
# spreadsheet tools detect the encoding of the Chinese text.
for frame, csv_path in ((data, 'jd_products.csv'), (comments, 'jd_comments.csv')):
    frame.to_csv(csv_path, index=False, encoding='utf-8-sig')


请使用京东 App 扫描二维码登录


KeyboardInterrupt: Interrupted by user

以上代码虽然爬取到了商品，但是却没有拿到评论，原因可能是京东把我们识别成了爬虫。下面我们使用 selenium-stealth，并通过设置
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-extensions")
    options.add_argument("--window-size=1920,1080")
来尝试绕过反爬虫检测，拿到评论信息。

In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium_stealth import stealth
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import logging
import traceback

# Configure the root logger: INFO level, "timestamp - level - message" lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def setup_driver():
    """Create a Chrome WebDriver with selenium-stealth applied.

    The chromedriver binary is resolved through ``ChromeDriverManager``.
    Chrome is started with the options below and ``stealth()`` is applied to
    the new session.

    Returns:
        The configured driver.

    Raises:
        Exception: Whatever driver installation, browser start-up or
            ``stealth()`` raised. It is logged with a traceback and re-raised.
            If Chrome had already started, it is shut down first so no
            orphaned browser process is left behind.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-gpu")
    # NOTE(review): --no-sandbox weakens browser isolation; confirm it is needed.
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-extensions")
    options.add_argument("--window-size=1920,1080")
    # Drop the automation switch and the automation extension.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = None
    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)

        stealth(driver,
                languages=["zh-CN", "zh"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
        )

        logging.info("成功创建 Chrome 驱动")
        return driver
    except Exception as e:
        logging.error(f"创建驱动失败: {e}")
        logging.error(traceback.format_exc())
        if driver is not None:
            # Chrome started but a later step failed: the caller never gets
            # the driver, so it must be closed here.
            try:
                driver.quit()
            except Exception as quit_error:
                logging.warning(f"关闭浏览器失败: {quit_error}")
        raise

def open_product_page(driver, product_id):
    """Load the JD item page for ``product_id`` and pause briefly.

    Args:
        driver: Object exposing ``get(url)`` (the WebDriver).
        product_id: Item id inserted into ``https://item.jd.com/<id>.html``.

    Raises:
        Exception: Any error raised while loading or pausing is logged and
            re-raised unchanged.
    """
    page_url = f"https://item.jd.com/{product_id}.html"
    try:
        driver.get(page_url)
        logging.info(f"已打开商品页面: {page_url}")
        # Random 3-5 second pause after the page load.
        pause_seconds = random.uniform(3, 5)
        time.sleep(pause_seconds)
    except Exception as exc:
        logging.error(f"打开商品页面失败: {exc}")
        raise

def wait_for_login(driver):
    """Block until the user confirms on the console that they have logged in.

    Args:
        driver: Unused; kept so the call signature matches the other steps.
    """
    prompt = "请在浏览器中登录，然后按回车键继续..."
    input(prompt)
    logging.info("用户确认已登录")

def navigate_to_comments(driver, wait):
    """Open the comment section of the product page.

    Tries several CSS selectors in order and clicks the first one that becomes
    clickable. It then sleeps 5 seconds, scrolls to the bottom of the page and
    sleeps another 2 seconds.

    Args:
        driver: The WebDriver showing the product page.
        wait: ``WebDriverWait`` used to wait for each candidate element.

    Raises:
        Exception: If none of the selectors could be clicked, or if scrolling
            fails. The error is logged and re-raised.
    """
    try:
        # Candidate selectors for the comment tab, tried in order.
        selectors = [
            'li[data-anchor="#comment"]',
            'a[href="#comment"]',
            'div[id="comment"]',
            'div.tab-main:nth-child(2)'
        ]

        for selector in selectors:
            try:
                comment_tab = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
                comment_tab.click()
                logging.info(f"已点击评论标签，使用选择器: {selector}")
                break
            except WebDriverException:
                # Timeout or failed click: fall through to the next selector.
                # Only Selenium errors are swallowed, so Ctrl-C still
                # interrupts and programming errors are not hidden.
                continue
        else:
            raise Exception("无法找到评论标签")

        time.sleep(5)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

    except Exception as e:
        logging.error(f"导航到评论区时出错: {e}")
        raise

def save_page_source(driver):
    """Dump the driver's current page HTML to ``jd_comment_page.html``.

    Best effort: any failure is logged and swallowed, so this never raises.

    Args:
        driver: Object exposing a ``page_source`` attribute.
    """
    target = "jd_comment_page.html"
    try:
        with open(target, "w", encoding="utf-8") as dump:
            dump.write(driver.page_source)
        logging.info("已保存页面源代码到 jd_comment_page.html")
    except Exception as exc:
        logging.error(f"保存页面源代码失败: {exc}")

def extract_comments(driver, wait, comments):
    """Parse the comments on the current page and append them to ``comments``.

    Candidate container selectors are tried in order; the first one that
    matches any element is used. Each matched element is parsed into a dict
    with the keys ``content``, ``time``, ``user`` and ``score``. Elements that
    cannot be parsed are logged and skipped.

    The page source is saved for inspection both when no container matches
    and when containers match but none of them can be parsed.

    Args:
        driver: The WebDriver showing the comment section.
        wait: Unused here; kept for a uniform call signature.
        comments: List that receives one dict per parsed comment (mutated
            in place).

    Raises:
        Exception: Unexpected errors outside per-item parsing are logged and
            re-raised.
    """
    try:
        # Candidate selectors for a single comment container, tried in order.
        selectors = [
            'div.comment-item',
            'div.comment-list div',
            'div[data-guid]',
            'div.comment'
        ]

        comment_items = []
        for selector in selectors:
            comment_items = driver.find_elements(By.CSS_SELECTOR, selector)
            if comment_items:
                logging.info(f"找到评论元素，使用选择器: {selector}")
                break

        if not comment_items:
            logging.warning("未找到评论元素，可能页面结构已改变")
            save_page_source(driver)
            return

        extracted = 0
        for item in comment_items:
            try:
                content = item.find_element(By.CSS_SELECTOR, 'p.comment-con, div.comment-content').text
                time_elem = item.find_element(By.CSS_SELECTOR, 'span.comment-time, span.time').text
                user = item.find_element(By.CSS_SELECTOR, 'div.user-info, span.user-name').text
                stars = len(item.find_elements(By.CSS_SELECTOR, 'span.star-red, i.star-yellow'))

                comments.append({
                    'content': content.strip() if content else "N/A",
                    'time': time_elem.strip() if time_elem else "N/A",
                    'user': user.strip() if user else "N/A",
                    'score': stars,
                })
                extracted += 1
            except Exception as e:
                logging.error(f"提取单条评论时出错: {e}")

        # Report what was actually parsed, not merely how many elements matched.
        logging.info(f"已提取 {extracted}/{len(comment_items)} 条评论")
        if extracted == 0:
            # Elements matched but none had the expected children: keep the
            # HTML so the selectors can be corrected.
            logging.warning("匹配到的元素中未能解析出任何评论，可能页面结构已改变")
            save_page_source(driver)
    except Exception as e:
        logging.error(f"提取评论过程中发生错误: {e}")
        raise

def next_page(driver, wait):
    """Go to the next comment page.

    Tries several "next" selectors in order. For the first one that becomes
    clickable: if its ``class`` attribute contains ``disabled`` the last page
    has been reached; otherwise it is clicked, followed by a random 3-5 second
    pause.

    Args:
        driver: The WebDriver (not used directly; kept for a uniform signature).
        wait: ``WebDriverWait`` used to wait for each candidate button.

    Returns:
        True if a next-page button was clicked, False if the last page was
        reached, no button was found, or an unexpected error occurred.
    """
    try:
        # Candidate selectors for the "next page" control, tried in order.
        selectors = [
            'a.ui-pager-next',
            'a.next',
            'a[aria-label="Next"]'
        ]

        for selector in selectors:
            try:
                next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
                # get_attribute returns None when the attribute is absent.
                if 'disabled' in (next_button.get_attribute('class') or ''):
                    logging.info("已到达最后一页")
                    return False
                next_button.click()
                logging.info(f"已点击下一页，使用选择器: {selector}")
                time.sleep(random.uniform(3, 5))
                return True
            except WebDriverException:
                # Timeout or failed click: try the next selector. Only
                # Selenium errors are swallowed, so Ctrl-C still interrupts.
                continue

        logging.error("无法找到下一页按钮")
        return False
    except Exception as e:
        logging.error(f"翻页时发生错误: {e}")
        return False

def main(product_id='100095516761', comment_pages=5):
    """Scrape comments of one JD product and save them to ``jd_comments.csv``.

    Steps: start the browser, open the product page, wait for the user to log
    in manually, open the comment section, then extract up to
    ``comment_pages`` pages of comments. The browser is always closed, and any
    comments collected before an error are still written to disk.

    Args:
        product_id: JD item id to scrape (defaults to the original hard-coded id).
        comment_pages: Maximum number of comment pages to scrape (default 5).
    """
    comments = []
    driver = None

    try:
        driver = setup_driver()
        wait = WebDriverWait(driver, 20)

        open_product_page(driver, product_id)
        wait_for_login(driver)  # manual login; blocks until Enter is pressed
        navigate_to_comments(driver, wait)

        for page in range(comment_pages):
            logging.info(f"正在爬取第 {page + 1} 页评论")
            extract_comments(driver, wait, comments)
            # Do not click "next" after the last requested page: that page
            # would be loaded but never scraped.
            if page + 1 >= comment_pages or not next_page(driver, wait):
                break

    except Exception as e:
        logging.error(f"爬取过程中发生错误: {e}")
        logging.error(traceback.format_exc())
    finally:
        if driver:
            driver.quit()
            logging.info("已关闭浏览器")

    if comments:
        comments_df = pd.DataFrame(comments)
        comments_df.to_csv('jd_comments.csv', index=False, encoding='utf-8-sig')
        logging.info(f"成功爬取 {len(comments_df)} 条评论，已保存到 jd_comments.csv 文件")
        logging.info("\n评论示例:")
        logging.info(comments_df.head().to_string())
    else:
        logging.warning("未成功爬取到评论")

if __name__ == "__main__":
    main()

2024-10-22 03:25:11,778 - INFO - Get LATEST chromedriver version for google-chrome
2024-10-22 03:25:11,961 - INFO - Get LATEST chromedriver version for google-chrome
2024-10-22 03:25:12,132 - INFO - Driver [C:\Users\Administrator\.wdm\drivers\chromedriver\win64\130.0.6723.58\chromedriver-win32/chromedriver.exe] found in cache
2024-10-22 03:25:13,448 - INFO - 成功创建 Chrome 驱动
2024-10-22 03:25:15,086 - INFO - 已打开商品页面: https://item.jd.com/100095516761.html


请在浏览器中登录，然后按回车键继续... 


2024-10-22 03:25:42,940 - INFO - 用户确认已登录
2024-10-22 03:25:42,995 - INFO - 已点击评论标签，使用选择器: li[data-anchor="#comment"]
2024-10-22 03:25:50,002 - INFO - 正在爬取第 1 页评论
2024-10-22 03:25:50,030 - INFO - 找到评论元素，使用选择器: div.comment
2024-10-22 03:25:50,039 - ERROR - 提取单条评论时出错: Message: no such element: Unable to locate element: {"method":"css selector","selector":"p.comment-con, div.comment-content"}
  (Session info: chrome=130.0.6723.59); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x003674D3+23139]
	(No symbol) [0x002EFCA4]
	(No symbol) [0x001CC2A3]
	(No symbol) [0x0020DC66]
	(No symbol) [0x0020DEAB]
	(No symbol) [0x00203CD1]
	(No symbol) [0x0022FEC4]
	(No symbol) [0x00203BF4]
	(No symbol) [0x00230114]
	(No symbol) [0x00249564]
	(No symbol) [0x0022FC16]
	(No symbol) [0x0020216C]
	(No symbol) [0x002030FD]
	GetHandleVerifier [0x005FBAA3+2727987]
	GetHandleVerifier [0x

我们主要通过以下几种方式来绕过反爬虫机制：

1. 使用 selenium-stealth：
   这是最关键的一步。上面的代码实际使用的是 selenium-stealth（而不是 undetected_chromedriver）：`stealth(driver, ...)` 会修改浏览器暴露给页面的一些特征（如语言、厂商、平台、WebGL 信息等），使其更难被识别为自动化工具。

2. 模拟真实用户行为：
   - 添加随机延时：使用 `random.uniform()` 在操作之间添加随机时间间隔，模拟人类操作的不规则性。
   - 滚动页面：使用 `driver.execute_script()` 执行滚动操作，模拟用户浏览页面的行为。

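3. 自定义 Chrome 选项：
   - 禁用 GPU 加速（`--disable-gpu`）
   - 禁用沙盒模式（`--no-sandbox`）
   - 禁用扩展（`--disable-extensions`）
   - 设置窗口大小（`--window-size=1920,1080`）
   - 去掉自动化标记（`excludeSwitches: ["enable-automation"]`、`useAutomationExtension: False`）
   其中最后一项用于隐藏自动化标记，窗口大小让页面按常见的桌面分辨率渲染；其余选项主要影响运行环境的稳定性，本身并不能让浏览器更像真实用户的浏览器。注意 `--no-sandbox` 会降低浏览器的安全性，非必要时不建议使用。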

4. 使用真实的 User-Agent：
   这个脚本没有显式设置 User-Agent；由于启动的是真实的（非无头模式）Chrome 浏览器，请求会带上浏览器默认的 User-Agent。

5. 分步加载和等待：
   在导航到评论页面后，我们添加了额外的等待时间和滚动操作，确保页面完全加载。

6. 错误处理和重试机制：
   虽然这个脚本中没有明确的重试逻辑，但我们添加了大量的错误处理，这有助于脚本在遇到临时问题时能够继续运行。

7. 避免频繁请求：
   通过限制爬取的页数（在这个例子中是5页），我们减少了被检测为机器人的可能性。

8. 保存页面源代码：
   虽然这不直接帮助绕过反爬虫，但它允许我们在遇到问题时分析页面结构，有助于调整爬虫策略。

这些方法综合起来，使得我们的爬虫行为更接近于真实用户，从而降低了被检测和阻止的可能性。然而，需要注意的是，网站的反爬虫机制可能会不断更新，所以这种方法可能需要定期调整以保持有效。

In [12]:
!pip install webdriver_manager

Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver_manager
Successfully installed webdriver_manager-4.0.2
