In [56]:
import requests
import re
import time
from bs4 import BeautifulSoup 
import pandas as pd


In [57]:
# HTTP request headers modelled on a desktop Chrome session; the scraping
# functions in this notebook pass this dict to every requests.get call.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # 'br' (Brotli) is intentionally NOT advertised: requests/urllib3 can only
    # decode it when an optional Brotli package is installed. Without that
    # package a Brotli-compressed reply would leave response.text as
    # undecodable bytes and the scraper would silently find nothing.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
}



In [58]:
def get_all_keshi_code():
    """
    Download the department index page and collect every department code.

    A code is the run of digits in a link of the form ``/keshi/<digits>.htm``
    e.g., Cardiology's code: 1010000

    Returns
    -------
    all_code : list of str
        Department codes in order of first appearance on the page, with
        repeated codes removed.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an
        HTTP error status.

    """
    url = 'https://www.haodf.com/keshi/list.htm'
    # Same 20 s timeout as the per-page requests, so a stalled connection
    # cannot hang the notebook forever.
    response = requests.get(url, headers=headers, timeout=20)
    # Fail loudly on an error page instead of silently returning no codes.
    response.raise_for_status()
    # The dot is escaped so only a literal ".htm" suffix matches.
    found = re.findall(r"/keshi/(\d+)\.htm", response.text)
    # dict.fromkeys drops repeats while keeping first-seen order.
    return list(dict.fromkeys(found))




In [1]:
def _parse_doctor_cells(cells):
    """Convert the flat list of ``good_doctor_list_td`` cells into doctor records.

    The cells are consumed in groups of four: name/title, hospital/department,
    recommendation score, price. A trailing incomplete group is ignored.
    """
    records = []
    for start in range(0, len(cells) - 3, 4):
        name_text = cells[start].text
        name_lines = name_text.split('\n')
        if len(name_lines) > 11:
            # In the multi-line layout the name is on line 4 and the title on
            # line 11 of the cell text.
            name, title = name_lines[4].strip(), name_lines[11].strip()
        else:
            # Too few lines for the positional layout: keep the whole text as
            # the name and leave the title empty (always a plain string).
            name, title = name_text.strip(), ''

        hospital_text = cells[start + 1].text
        hospital_lines = hospital_text.split('\n')
        if len(hospital_lines) > 1:
            hospital, dept = hospital_lines[0].strip(), hospital_lines[1].strip()
        else:
            hospital, dept = hospital_text.strip(), ''

        # Strip the trailing link captions that share the cell with the value.
        score = cells[start + 2].text.replace('查看更多推荐>>', '').strip()
        price = cells[start + 3].text.replace("访问个人网站>>", '').strip()

        records.append({"Doctor's Name": name, "Title": title, "Hospital": hospital,
                        "Department": dept, "Recommendation Score": score, "Price": price})
    return records


def load_doctor_info(code, page_size, delay=20, timeout=20):

    """
    Extract doctors' information for one department, page by page.

    Parameters
    ----------
    code : str or int
        dept's code; it is interpolated into the page URL
    page_size : int
        maximum number of list pages to request for this dept.
        Use a large value such as 10000 to fetch everything: the loop stops
        at the first page that contains no doctor cells.
    delay : int or float, optional
        seconds to pause after each page, before the single retry of a failed
        request, and after the empty page that ends the dept (default 20)
    timeout : int or float, optional
        per-request timeout in seconds (default 20)

    Returns
    -------
    data : list of dict
        one record per doctor with the keys "Doctor's Name", "Title",
        "Hospital", "Department", "Recommendation Score" and "Price"

    Raises
    ------
    requests.RequestException
        if a page request fails and its single retry fails as well

    """

    data = []

    for page in range(1, page_size + 1):
        url = f"https://haoping.haodf.com/keshi/{code}/daifu_all_{page}.htm"

        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            # Only network-level failures are retried (once, after a pause);
            # KeyboardInterrupt and programming errors are no longer swallowed.
            time.sleep(delay)
            response = requests.get(url, headers=headers, timeout=timeout)

        html = response.text
        print(f"page {page} ", url)
        time.sleep(delay)

        # Parse the page and collect every table cell that holds doctor info
        soup = BeautifulSoup(html, "lxml")
        cells = soup.find_all(class_='good_doctor_list_td')

        if not cells:
            # No doctor cells on this page: stop paging through this dept.
            print("not find data url: ", url)
            time.sleep(delay)
            break

        data.extend(_parse_doctor_cells(cells))

    return data
                 
    

In [None]:
# Number of doctor-list pages to request for each department.
# To scrape everything, pick a value larger than any department's real page
# count (e.g. page_size = 100000): load_doctor_info stops at the first page
# without doctor entries, and the loop below then moves on to the next
# department.
#
# Small example value for a quick trial run:
page_size = 3

codes = get_all_keshi_code()
doctors = []
downloaded_codes = []
failed_codes = []  # departments whose download raised a request error

# dict.fromkeys removes repeated codes while keeping their original order,
# so each department is attempted exactly once without a linear list scan.
for code in dict.fromkeys(codes):
    try:
        dept_doctors = load_doctor_info(code, page_size)
    except requests.RequestException as exc:
        # One unreachable department must not throw away everything already
        # scraped; record it and carry on with the next department.
        failed_codes.append(code)
        print("failed to download dept: ", code, exc)
        continue
    doctors.extend(dept_doctors)
    downloaded_codes.append(code)
    print("downloaded doctors: ", len(doctors))

df = pd.DataFrame(doctors)


第1页  https://haoping.haodf.com/keshi/1010000/daifu_all_1.htm
第2页  https://haoping.haodf.com/keshi/1010000/daifu_all_2.htm
downloaded doctors:  60
第1页  https://haoping.haodf.com/keshi/1007000/daifu_all_1.htm
第2页  https://haoping.haodf.com/keshi/1007000/daifu_all_2.htm
downloaded doctors:  120
第1页  https://haoping.haodf.com/keshi/1009000/daifu_all_1.htm
第2页  https://haoping.haodf.com/keshi/1009000/daifu_all_2.htm
downloaded doctors:  180
第1页  https://haoping.haodf.com/keshi/1005000/daifu_all_1.htm
第2页  https://haoping.haodf.com/keshi/1005000/daifu_all_2.htm
downloaded doctors:  240
第1页  https://haoping.haodf.com/keshi/1004000/daifu_all_1.htm
第2页  https://haoping.haodf.com/keshi/1004000/daifu_all_2.htm
downloaded doctors:  300
第1页  https://haoping.haodf.com/keshi/1002000/daifu_all_1.htm
第2页  https://haoping.haodf.com/keshi/1002000/daifu_all_2.htm
downloaded doctors:  360
第1页  https://haoping.haodf.com/keshi/1008000/daifu_all_1.htm
第2页  https://haoping.haodf.com/keshi/1008000/daifu_all_2.h

In [None]:
# Remove repeated doctors: two rows count as the same doctor when their name,
# hospital and department all match; the first occurrence is the one kept.
dedup_columns = ["Doctor's Name", 'Hospital', 'Department']
df.drop_duplicates(subset=dedup_columns, keep='first', inplace=True)

# Save the de-duplicated table to an Excel workbook, omitting the row index.
df.to_excel('doctor_info.xlsx', index=False)