# 아산병원 질환백과 기본정보 크롤링

In [2]:
import requests
from bs4 import BeautifulSoup
import csv

# Crawl the basic disease information listed in the Asan Medical Center
# disease encyclopedia (one list page at a time, per disease kind) and store
# one CSV row per disease in ./../Data/diseases.csv.

# Column names written as the first row of the CSV file.
header = ['disease_id', 'disease_name', 'disease_link', 'disease_img', 'symptoms', 'symptom_ids', 'related_diseases', 'related_disease_ids', 'department', 'synonyms']

# Disease kind ids C000001 .. C000020, visited in order.
disease_kind_ids = [f'C{n:06d}' for n in range(1, 21)]

# Prefix put in front of the scraped link and image paths when they are written out.
BASE_URL = "https://www.amc.seoul.kr"

# List pages requested per disease kind (pageIndex 1 .. MAX_PAGES).
MAX_PAGES = 10

# Seconds to wait on a request so a stalled connection cannot hang the crawl.
REQUEST_TIMEOUT = 30


def _find_dd(item, label):
    """Return the <dd> immediately following the first <dt> whose text contains *label*.

    Same match as the CSS selector ``dt:contains("label") + dd``, written as a
    plain traversal so it does not rely on the deprecated ``:contains``
    pseudo-class. Returns None when no such <dt>/<dd> pair exists in *item*.
    """
    for dt in item.find_all('dt'):
        if label not in dt.get_text():
            continue
        # True matches any tag, so text nodes between <dt> and <dd> are skipped.
        sibling = dt.find_next_sibling(True)
        if sibling is not None and sibling.name == 'dd':
            return sibling
    return None


def _link_names_and_ids(dd):
    """Return ``(names, ids)`` for the <a> tags inside *dd*.

    A name is the stripped link text; an id is whatever follows the last '='
    in the link's href. Both lists are empty when *dd* is None.
    """
    if dd is None:
        return [], []
    anchors = dd.select('a')
    names = [a.text.strip() for a in anchors]
    ids = [a.get('href', '').split('=')[-1] for a in anchors]
    return names, ids


def _dd_text(dd):
    """Return the stripped text of *dd*, or '' when *dd* is None."""
    return dd.text.strip() if dd is not None else ''


# Open the CSV file; rows are written as they are scraped.
with open('./../Data/diseases.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)

    # Loop over the disease kinds.
    for kind_id in disease_kind_ids:
        print(kind_id, " 시작")

        # Loop over the list pages of this kind.
        for i in range(1, MAX_PAGES + 1):
            url = f'https://www.amc.seoul.kr/asan/healthinfo/disease/diseaseList.do?pageIndex={i}&partId=&diseaseKindId={kind_id}&searchKeyword=%EA%B0%90%EC%97%BC%EC%84%B1%EC%A7%88%ED%99%98'
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            if not response.ok:
                # Report the failed page instead of silently parsing an error body.
                print(kind_id, f" pageIndex={i} HTTP {response.status_code}")
                continue
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract the disease entries on this page.
            disease_list = soup.select('#listForm > div > div > ul > li')
            if not disease_list:
                # An empty page means the list is exhausted; skip the remaining pages.
                break

            # Extract the fields of each disease and append them to the CSV.
            for disease in disease_list:
                disease_name_elem = disease.select_one('.contTitle a')
                if disease_name_elem is None or not disease_name_elem.get('href'):
                    # Not a disease entry (no title link); nothing to record.
                    continue
                disease_name = disease_name_elem.text.strip()
                disease_link = disease_name_elem['href']
                disease_id = disease_link.split('=')[-1]

                # Entries without a thumbnail get an empty image column.
                img_elem = disease.select_one('.imgBox img')
                img_src = img_elem.get('src') if img_elem is not None else None
                disease_img = BASE_URL + img_src if img_src else ''

                # Symptoms and related diseases: linked names plus their ids.
                symptom_names, symptom_ids = _link_names_and_ids(_find_dd(disease, '증상'))
                related_names, related_disease_ids = _link_names_and_ids(_find_dd(disease, '관련질환'))

                # Department and synonyms: plain text of the matching <dd>.
                department = _dd_text(_find_dd(disease, '진료과'))
                synonyms = _dd_text(_find_dd(disease, '동의어'))

                # Append the row to the CSV file.
                writer.writerow([
                    disease_id,
                    disease_name,
                    BASE_URL + disease_link,
                    disease_img,
                    ', '.join(symptom_names),
                    ','.join(symptom_ids),
                    ', '.join(related_names),
                    ','.join(related_disease_ids),
                    department,
                    synonyms,
                ])
        print(kind_id, " 끝")

print("CSV 파일이 성공적으로 생성되었습니다.")


C000001  시작
C000001  끝
CSV 파일이 성공적으로 생성되었습니다.
