In [None]:
import random
import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from lxml import etree

In [None]:
def parse_url(url, timeout=30):
    """Download ``url`` and return it parsed as an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``.
            Without one, requests waits indefinitely on a stalled
            connection, which would freeze the whole crawl.

    Returns:
        The value produced by ``etree.HTML`` for the decoded response body.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # Present a desktop browser User-Agent instead of the requests default.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36 Edg/86.0.622.63"}
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Decode as UTF-8 and drop undecodable bytes rather than failing on them.
    content = response.content.decode('utf-8', 'ignore')
    return etree.HTML(content)

In [None]:
def get_base_info(page_url):
    """Scrape the basic fields of every listing on one result page.

    Args:
        page_url: URL of a listing result page.

    Returns:
        A list of dicts, one per listing, with the keys ``标题`` (title),
        ``链接`` (detail-page link without its query string), ``总价``
        (total price) and ``均价`` (unit price).
    """
    html = parse_url(page_url)
    titles = html.xpath(
        '//div[@class="list-info"]/h2[@class="title"]/a/text()')  # title
    urls = html.xpath(
        '//div[@class="list-info"]/h2[@class="title"]/a/@href')  # link
    total_prices = html.xpath(
        '//div[@class="price"]/p[@class="sum"]/b/text()')  # total price
    unit_prices = html.xpath(
        '//div[@class="price"]/p[@class="unit"]/text()')  # unit price

    base_infos = []
    # NOTE: zip stops at the shortest list, so the four XPath results are
    # paired purely by position.
    for title, url, total_price, unit_price in zip(titles, urls, total_prices, unit_prices):
        # Resolve the href against the page URL. This turns protocol-relative
        # links ('//host/path') into 'https://host/path' as before, while
        # leaving already-absolute links (including 'http://...') intact
        # instead of blindly prefixing them with 'https:'.
        absolute_url = urljoin(page_url, url)
        base_infos.append({
            '标题': title,
            '链接': absolute_url.split('?')[0],  # drop the query string
            '总价': total_price,
            '均价': unit_price,
        })
    return base_infos

In [None]:
def _first_text(html, xpath_expr):
    """Return the first result of ``xpath_expr`` on ``html``, or '' if none."""
    matches = html.xpath(xpath_expr)
    return matches[0] if matches else ''


def get_extra_info(info):
    """Visit a listing's detail page and add its fields to ``info``.

    Args:
        info: Listing dict containing at least the key ``链接`` (detail-page
            URL). It is modified in place.

    Returns:
        The same ``info`` dict, extended with ``位置1``/``位置2`` (location),
        one entry per label/value pair of the "generalSituation" table, and
        the community fields ``小区名``, ``小区均价``, ``物业费``, ``容积率``,
        ``绿化率`` and ``车位信息``. A field whose XPath matches nothing is
        stored as an empty string.
    """
    html = parse_url(info['链接'])

    # Location links; only a missing node is treated as "no value" now,
    # instead of a bare ``except`` that also hid unrelated errors and
    # swallowed KeyboardInterrupt.
    location_base = '/html/body/div[4]/div[2]/div[2]/ul/li[2]/span[2]'
    info['位置1'] = _first_text(html, location_base + '/a[1]/text()').strip()
    info['位置2'] = _first_text(
        html, location_base + '/a[2]/text()').replace('－', '').strip()

    # Label/value pairs from the detail table. The first label is skipped;
    # presumably it has no matching value span -- TODO confirm against the page.
    info_keys = html.xpath(
        '//*[@id="generalSituation"]//span[@class="mr_25 c_999"]/text()')[1:]
    value_nodes = html.xpath(
        '//*[@id="generalSituation"]//span[@class="c_000"]')
    for key, node in zip(info_keys, value_nodes):
        info[key] = node.text

    # Community block: the name, then values taken from fixed <li> positions.
    community_base = '//*[@id="xiaoWrap"]/div/div[2]'
    info['小区名'] = _first_text(html, community_base + '/h3/a/text()').strip()
    community_fields = (
        ('小区均价', 1),
        ('物业费', 3),
        ('容积率', 4),
        ('绿化率', 5),
        ('车位信息', 6),
    )
    for field, li_index in community_fields:
        info[field] = _first_text(
            html, f'{community_base}/ul/li[{li_index}]/span[2]/text()')

    return info

In [None]:
# Collect the basic listing data from the first six result pages.
base_url = 'https://xm.58.com/ershoufang/pn'
infos = []
for i in range(1, 7):
    # Random 10-20 s pause before each request to cope with anti-scraping.
    time.sleep(random.randint(10, 20))
    page_url = f'{base_url}{i}'
    results = get_base_info(page_url)
    infos += results
    print(f'爬取页面{i}的基础信息成功！')

In [None]:
# Enrich every listing with the fields from its detail page.
for i in range(len(infos)):
    time.sleep(random.randint(10, 20))
    infos[i] = get_extra_info(infos[i])
    if infos[i]['位置1'] != '' or infos[i]['小区名'] != '':
        print("爬取第{}条信息成功：{}".format(i, infos[i]['标题']))
        continue
    # Both fields empty: the site has started its human verification.
    # After passing it in a browser, change this loop's start index to
    # resume from the failed listing.
    print(f'爬取第{i}条信息失败,请进行人机验证! ')
    print(infos[i]['链接'])
    # Save what has been collected so far before stopping.
    data = pd.DataFrame(infos)
    data.to_csv('data2.csv')
    break

In [None]:
# Build a table from all collected listings and export it as CSV.
data = pd.DataFrame(data=infos)
data.to_csv(path_or_buf='data2.csv')