In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time
import pandas as pd  # dùng để lưu CSV

# Request headers sent with every HTTP call: a desktop-browser User-Agent.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# Root of the alphabetical disease index; one letter is appended per request.
base_url = "https://www.vinmec.com/vie/tra-cuu-benh/"

# The 26 lowercase letters "a".."z".
letters = list(map(chr, range(ord("a"), ord("z") + 1)))

# Accumulators filled by the crawl steps below:
#   disease_links - absolute URLs of the disease pages found in the index
#   graph_data    - one dict per successfully parsed disease page
disease_links, graph_data = [], []

# Step 1: crawl the A-Z index pages and collect every disease URL exactly once.
# `disease_links` keeps discovery order; the set only provides O(1) membership tests.
seen_links = set(disease_links)
for letter in letters:
    index_url = base_url + letter
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        res = requests.get(index_url, headers=headers, timeout=30)
        # Surface HTTP errors instead of silently parsing an error page into zero links.
        res.raise_for_status()
    except requests.RequestException as e:
        # Report and move on to the next letter, mirroring the per-page
        # error handling used when crawling the individual disease pages.
        print(f"❌ Error at {index_url}: {e}")
    else:
        soup = BeautifulSoup(res.text, "html.parser")

        # Disease entries are anchors directly inside <li> whose href starts with /vie/benh/.
        for link in soup.select("li > a[href^='/vie/benh/']"):
            full_url = "https://www.vinmec.com" + link.get("href")
            if full_url not in seen_links:
                seen_links.add(full_url)
                disease_links.append(full_url)
    time.sleep(1)  # pause between index requests

print(f"✅ Tổng số bệnh: {len(disease_links)}")

# Step 2: crawl each disease page and sort its sections into buckets by heading keyword.
# Every successfully parsed page appends one dict to `graph_data`; failures are
# printed and the loop moves on to the next link.
for idx, link in enumerate(disease_links):
    try:
        # A timeout keeps a stalled connection from hanging the crawl; the
        # resulting exception is reported by the handler at the bottom.
        res = requests.get(link, headers=headers, timeout=30)
        soup = BeautifulSoup(res.text, "html.parser")

        # Use the part of the page title before the first ":" as the disease name.
        title_tag = soup.select_one("div.f30.bold.mb2")
        if title_tag:
            raw_title = title_tag.text.strip()
            disease_name = raw_title.split(":")[0].strip()
        else:
            disease_name = "No Title"

        # Content layout: each block is expected to hold a heading and a body.
        blocks = soup.select("section.detail_sick .item_detial_sick")
        if not blocks:
            print(f"⚠️ Không tìm thấy nội dung trong: {link}")
            continue

        # Fresh buckets for every disease. `diagnosis` has to be created here as
        # well: it is appended to below and stored in the record, so leaving it
        # out raises NameError on a fresh kernel (or leaks text between diseases
        # if a stale variable happens to exist).
        overview, causes, symptoms, treatments, drugs, prevention, diagnosis = (
            [], [], [], [], [], [], []
        )

        for block in blocks:
            title_el = block.select_one("h2.title_detail_sick")
            content_div = block.select_one("div.body.collapsible-target")

            # Skip blocks that lack either a heading or a body.
            if not title_el or not content_div:
                continue

            section = title_el.text.lower()

            # Non-empty paragraph/list-item texts, de-duplicated within the
            # block while keeping first-seen order (dicts preserve insertion order).
            texts = (
                tag.get_text(strip=True)
                for tag in content_div.find_all(["p", "li"])
            )
            content = list(dict.fromkeys(text for text in texts if text))

            # First matching keyword wins, so the order of these branches matters.
            if "tổng quan" in section:
                overview += content
            elif "nguyên nhân" in section:
                causes += content
            elif any(k in section for k in ["triệu chứng", "dấu hiệu", "biểu hiện","lây truyền"]):
                symptoms += content
            elif any(k in section for k in ["điều trị", "phác đồ", "xử trí"]):
                treatments += content
            # NOTE: the drugs branch is disabled, so `drugs` is always stored empty.
            # elif "thuốc" in section:
            #     drugs += content
            elif "chẩn đoán" in section:
                diagnosis += content
            elif "phòng ngừa" in section:
                prevention += content

        graph_data.append({
            "disease": disease_name,
            "overview": overview,
            "causes": causes,
            "symptoms": symptoms,
            "treatments": treatments,
            "drugs": drugs,
            "prevention": prevention,
            "diagnosis": diagnosis,
            "url": link
        })

        print(f"✅ {idx+1}/{len(disease_links)}: {disease_name}")
        time.sleep(1)  # pause between page requests

    except Exception as e:
        # Best-effort crawl: report the failing page and continue with the next one.
        print(f"❌ Error at {link}: {e}")

# Step 3: dump the raw records (lists of text fragments) to JSON.
with open("vinmec_medical_graph.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(graph_data, ensure_ascii=False, indent=2))

# Step 4: write a CSV in which every list-valued field is joined into a single
# newline-separated string. "drugs" is deliberately not exported to the CSV.
csv_text_fields = ("overview", "causes", "symptoms", "treatments", "prevention", "diagnosis")

df = pd.DataFrame([
    {
        "disease": item["disease"],
        **{field: "\n".join(item[field]) for field in csv_text_fields},
        "url": item["url"],
    }
    for item in graph_data
])

df.to_csv("vinmec_medical_graph.csv", index=False, encoding="utf-8-sig")

# Summary of what was written.
print(f"\n🎉 DONE! Đã lưu {len(graph_data)} bệnh vào:")
print("- vinmec_medical_graph.json")
print("- vinmec_medical_graph.csv")


In [None]:
import pandas as pd


In [None]:
# Reload the exported CSV from disk (utf-8-sig matches the encoding it was written with).
df = pd.read_csv(
    "vinmec_medical_graph.csv",
    encoding="utf-8-sig",
)

In [None]:
# Drop rows with any missing value, then drop rows repeating an already-seen disease name.
df = df.dropna().drop_duplicates(subset=["disease"])

In [None]:
# Number of rows remaining in the frame.
df.shape[0]
