# Website Scraping to Fetch a Q&A Dataset

In [None]:
# Install dependencies (for Colab): refresh the apt package index, then install
# the Chromium driver and the Selenium Python bindings.
!apt-get update
!apt install chromium-chromedriver -y
!pip install selenium

# ------------------------------
import csv
import re

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Scrape target and the accumulator that receives one dict per scraped thread.
BASE_URL = "https://www.agnoshealth.com"
query = "อ่อนเพลีย"
all_threads = []

# Headless Chrome: apply every command-line switch in one pass.
chrome_options = Options()
for flag in ("--headless=new", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(flag)

# One shared browser session plus an explicit wait with a 10-second timeout.
driver = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(driver, 10)

# XPath prefixes shared by the thread-page selectors below.
# NOTE(review): these are absolute paths copied from the page layout; they
# will silently stop matching if the site's markup changes.
SECTION_XPATH = '/html/body/div/div/div/div[1]/div/main/div/div/section[1]'
POST_XPATH = SECTION_XPATH + '/div[2]/a/article/div[1]'
MAX_KEYS = 4  # the CSV only has columns key1..key4


def _first_text(xpath):
    """Return the stripped text of the first element at *xpath*, or "" if unavailable."""
    try:
        return driver.find_element(By.XPATH, xpath).text.strip()
    except WebDriverException:
        # Missing/stale element: the field is optional, so fall back to blank.
        return ""


def _texts(xpath, limit=None):
    """Return the stripped texts of the elements at *xpath* (at most *limit* of them).

    Best-effort: if an element becomes unreadable part-way through, the texts
    collected so far are still returned.
    """
    collected = []
    try:
        for element in driver.find_elements(By.XPATH, xpath)[:limit]:
            collected.append(element.text.strip())
    except WebDriverException:
        pass  # keep whatever was read before the failure
    return collected


def _parse_gender_age(info_text):
    """Extract (gender, age) from the poster info line; either may be ""."""
    gender = ""
    if "ชาย" in info_text:
        gender = "ชาย"
    elif "หญิง" in info_text:
        gender = "หญิง"

    age_match = re.search(r"\d+", info_text)
    age = age_match.group() if age_match else ""
    return gender, age


def _scrape_thread(title, link):
    """Open one thread page and return its fields as a CSV-ready dict.

    Raises:
        WebDriverException: if navigating to *link* fails.
    """
    driver.get(link)

    # Wait for the question first so the immediate (non-waiting) lookups
    # below do not run against a page that has not finished rendering.
    try:
        question = wait.until(
            EC.presence_of_element_located((By.XPATH, POST_XPATH + '/div[4]/p'))
        ).text.strip()
    except WebDriverException:
        question = ""

    gender, age = _parse_gender_age(_first_text(POST_XPATH + '/div[1]/p'))
    key_texts = _texts(POST_XPATH + '/div[3]/ul/li', limit=MAX_KEYS)
    answers = [text for text in _texts(SECTION_XPATH + '/ul/li/p') if text]

    thread_info = {
        "title": title,
        "gender": gender,
        "age": age,
        "question": question,
        "symptom": _first_text(POST_XPATH + '/div[2]/p'),
        "answers": " | ".join(answers),
        "doctor": _first_text(SECTION_XPATH + '/ul/li/div[2]/div[2]/p[1]'),
        "link": link,
    }
    # Add key1..keyN for however many keys the thread actually has.
    thread_info.update(
        {f"key{idx}": text for idx, text in enumerate(key_texts, start=1)}
    )
    return thread_info


# Links already queued, so a thread that is matched twice (several anchors on
# one page, or the same thread on two result pages) is scraped only once.
seen_links = set()

for page in range(1, 6):  # try only the first 5 result pages for now
    search_url = f"{BASE_URL}/forums/search?page={page}&q={query}"
    print(f"กำลังดึงหน้า {page} ...")

    try:
        driver.get(search_url)
        thread_elements = wait.until(
            EC.presence_of_all_elements_located(
                (By.XPATH, '//a[contains(@href,"/forums/") and .//span]')
            )
        )
    except WebDriverException:
        print(f"⚠️ ไม่เจอ thread หน้า {page}")
        continue

    # Read title/link from every anchor *before* navigating away: the
    # elements become stale as soon as another page is loaded.
    thread_data = []
    for anchor in thread_elements:
        try:
            title = anchor.text.strip()
            link = anchor.get_attribute("href")
        except WebDriverException:
            continue
        if not link or "/forums/" not in link:
            continue
        # Links back to the search page itself (e.g. pagination) are not threads.
        if "/forums/search" in link or link in seen_links:
            continue
        seen_links.add(link)
        thread_data.append({"title": title, "link": link})

    for td in thread_data:
        try:
            all_threads.append(_scrape_thread(td["title"], td["link"]))
        except WebDriverException as exc:
            # One unreachable thread must not abort the whole scrape.
            print(f"⚠️ ข้าม thread {td['link']} ({type(exc).__name__})")

# ✅ Column order of the output CSV
keys = ["title", "gender", "age", "question", "key1", "key2", "key3", "key4", "symptom", "answers", "doctor", "link"]

try:
    with open("agnos_forum_xpath.csv", "w", encoding="utf-8", newline="") as f:
        # restval="" fills key1..key4 for threads that had fewer than four
        # keys, so the rows do not need to be patched by hand first.
        writer = csv.DictWriter(f, fieldnames=keys, restval="")
        writer.writeheader()
        writer.writerows(all_threads)
finally:
    # Always release the browser, even if writing the file failed.
    driver.quit()

print("✅ ดึงข้อมูลเสร็จและบันทึก agnos_forum_xpath.csv")

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Connecting to security.ubuntu.com (185.125.190.39)] [Connecting to cloud.r-                                                                               Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
0% [2 InRelease 25.8 kB/128 kB 20%] [Connecting to security.ubuntu.com (185.125                                                                               Get:3 https://cli.github.com/packages stable InRelease [3,917 B]
0% [2 InRelease 66.3 kB/128 kB 52%] [Connecting to security.ubuntu.com (185.1250% [2 InRelease 70.6 kB/128 kB 55%] [Connecting to security.ubuntu.com (185.1250% [Waiting for headers] [Waiting for headers] [Connecting to cloud.r-project.o                                                                               Get:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/u

# Cleaning Data

In [None]:
import pandas as pd

# Load the scraped CSV
df = pd.read_csv("agnos_forum_xpath.csv")

# Show the column names
print(df.columns)

# Keep only the columns used downstream (drops title, gender, age, link, key4)
columns_to_keep = ['question', 'key1', 'key2', 'key3', 'symptom', 'answers', 'doctor']
df = df[columns_to_keep]


# Handle missing values: a row is useless without a question and an answer;
# the remaining text columns are optional and default to an empty string.
df = df.dropna(subset=['question', 'answers'])
for col in ['key1', 'key2', 'key3', 'symptom', 'doctor']:
    df[col] = df[col].fillna("")


# Trim surrounding whitespace in every string cell.
# Done column by column with Series.map because DataFrame.applymap is
# deprecated in recent pandas releases and emits a FutureWarning.
def _strip_if_str(value):
    """Return *value* stripped when it is a string, unchanged otherwise."""
    return value.strip() if isinstance(value, str) else value


for col in df.columns:
    df[col] = df[col].map(_strip_if_str)

# Merge question, answers, symptom, key1-3 and doctor into a single text per
# row (for RAG), so one string can be used to build embeddings.
def combine_text(row):
    """Join the labelled, non-empty fields of *row* into one " | "-separated string.

    Args:
        row: Mapping-like object (dict or pandas Series) with the keys
            'question', 'answers', 'symptom', 'key1', 'key2', 'key3', 'doctor'.

    Returns:
        A string such as "Question: ... | Answers: ... | Key1: ...". Fields
        whose value is missing (None/NaN) or blank are left out entirely,
        instead of contributing a bare label like "Key2: ".
    """
    labelled_columns = [
        ("Question", "question"),
        ("Answers", "answers"),
        ("Symptom", "symptom"),
        ("Key1", "key1"),
        ("Key2", "key2"),
        ("Key3", "key3"),
        ("Doctor", "doctor"),
    ]

    parts = []
    for label, column in labelled_columns:
        value = row[column]
        if pd.isna(value):
            continue
        text = str(value).strip()
        # Filter on the value itself: the labelled part is never empty.
        if text:
            parts.append(f"{label}: {text}")
    return " | ".join(parts)

# Build the combined text row by row, then attach it as a new column.
combined_series = df.apply(combine_text, axis=1)
df['combined_text'] = combined_series

# Persist the cleaned dataset (without the index) for the RAG cell.
output_path = "rag_ready_agnos.csv"
df.to_csv(output_path, index=False)
print("✅ Dataset พร้อมใช้งานสำหรับ RAG Chatbot")

Index(['title', 'gender', 'age', 'question', 'key1', 'key2', 'key3', 'key4',
       'symptom', 'answers', 'doctor', 'link'],
      dtype='object')
✅ Dataset พร้อมใช้งานสำหรับ RAG Chatbot


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


# Pipeline+RAG

In [None]:
# Install the packages used by the RAG pipeline.
# NOTE(review): gradio is imported below but is not installed here, and
# langchain is installed but not imported in this cell — confirm both.
!pip install pandas sentence-transformers faiss-cpu langchain

# 1️⃣ โหลดไลบรารี
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import gradio as gr
import string

# 2️⃣ Load the cleaned CSV.
# read_csv turns the blank cells written by the cleaning step back into NaN;
# blank them out again so they never surface as the text "nan" in a reply or
# break the string handling further down.
df = pd.read_csv("rag_ready_agnos.csv").fillna("")
# astype(str) keeps the join safe even if a key column was parsed as numeric.
df['combined_keywords'] = df[['key1','key2','key3']].astype(str).agg(' '.join, axis=1)

# 3️⃣ Build the embeddings + FAISS index.
# The corpus and the user queries are Thai, so use a multilingual model;
# 'all-MiniLM-L6-v2' is an English-language model.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
corpus = df['combined_keywords'].tolist()
corpus_embeddings = model.encode(corpus, convert_to_numpy=True)

# Exact (brute-force) L2 index sized to the embedding dimension.
dim = corpus_embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(corpus_embeddings)

# 4️⃣ Highlight helper
# Built once: deletes every ASCII punctuation character from a string.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _normalized_words(text):
    """Return the lower-cased, punctuation-free words of *text* as a set.

    Non-string input (None, NaN, ...) yields an empty set.
    """
    if not isinstance(text, str):
        return set()
    return set(text.lower().translate(_PUNCTUATION_TABLE).split())


def highlight_answer(question, answer, keywords=None):
    """Wrap in ``**...**`` every word of *answer* that also occurs in *question* or *keywords*.

    Words are compared case-insensitively with ASCII punctuation removed; the
    original spelling of each answer word is kept in the output.

    Args:
        question: The user's question.
        answer: The answer text to decorate.
        keywords: Optional extra words to highlight.

    Returns:
        The answer with matching words wrapped in ``**`` and runs of
        whitespace collapsed to single spaces. A non-string *answer*
        (e.g. NaN from a blank CSV cell) returns "".

    NOTE(review): words are split on whitespace only, so text written without
    spaces between words will rarely produce a match.
    """
    if not isinstance(answer, str):
        return ""

    highlight_words = _normalized_words(question) | _normalized_words(keywords)

    highlighted = [
        f"**{word}**"
        if word.lower().translate(_PUNCTUATION_TABLE) in highlight_words
        else word
        for word in answer.split()
    ]
    return ' '.join(highlighted)

# 5️⃣ Retrieval function
def retrieve_symptom_with_answer(question, top_k=3):
    """Return the rows whose keywords are closest to *question*.

    Args:
        question: Free-text user query; embedded with the module-level model.
        top_k: Maximum number of neighbours to return.

    Returns:
        A DataFrame with the columns 'symptom' and 'answer', nearest match
        first, where 'answer' has been passed through highlight_answer.
        It has fewer than *top_k* rows (possibly none) when the index holds
        fewer entries.
    """
    # Never ask FAISS for more neighbours than it holds: the missing slots
    # come back as index -1, which df.iloc would silently map to the LAST row.
    k = max(0, min(top_k, index.ntotal))
    if k == 0:
        return df.iloc[0:0][['symptom', 'answers']].rename(columns={'answers': 'answer'})

    q_emb = model.encode([question], convert_to_numpy=True)
    _, neighbours = index.search(q_emb, k)
    positions = [pos for pos in neighbours[0] if pos >= 0]

    top_results = df.iloc[positions][['symptom','combined_keywords','answers']].rename(
        columns={'answers':'answer', 'combined_keywords':'keywords'}
    )

    top_results['answer'] = [
        highlight_answer(question, answer, keywords)
        for answer, keywords in zip(top_results['answer'], top_results['keywords'])
    ]

    return top_results[['symptom','answer']]

# 6️⃣ Gradio callback
def chat_interface(user_input):
    """Turn the user's symptom description into the chatbot's text reply.

    Args:
        user_input: Text typed by the user; may be empty or None.

    Returns:
        A message listing the closest matches (symptom plus highlighted
        advice) followed by a medical disclaimer, or a "nothing found"
        message when the input is blank or retrieval returns no rows.
    """
    no_match_message = "ไม่พบอาการที่ใกล้เคียง กรุณาลองพิมพ์คำถามใหม่"

    # Blank (or missing) input: nothing to search for.
    if not user_input or not user_input.strip():
        return no_match_message

    matched = retrieve_symptom_with_answer(user_input, top_k=3)

    if matched.empty:
        return no_match_message

    # Report the number of matches actually found rather than assuming 3.
    parts = [
        f"จากการประเมินอาการเบื้องต้น พบอาการใกล้เคียง {len(matched)} อาการ ดังนี้:\n\n"
    ]

    for _, row in matched.iterrows():
        # A blank symptom read back from CSV is NaN; show it as empty, not "nan".
        symptom = row['symptom'] if isinstance(row['symptom'], str) else ""
        parts.append(
            f"🔹 จากการประเมินอาการเบื้องต้น คาดว่ามี/เป็น: {symptom}\nคำแนะนำ: {row['answer']}\n\n"
        )

    parts.append(
        "หมายเหตุ: แชทบอทนี้เป็นการประเมินอาการเบื้องต้น ซึ่งอาจจะใช่หรือไม่ใช่ก็ได้ ควรติดต่อพบแพทย์เพื่อรับการรักษาอีกครั้ง ไม่ควรรักษาด้วยตัวเอง"
    )

    return "".join(parts)

# Confirmation message printed once the cell has run to this point.
print("✅ RAG Complete")

✅ RAG Complete


# Gradio Interface

In [None]:
# 7️⃣ Build the Gradio interface: a two-line text box in, plain text out.
symptom_box = gr.Textbox(lines=2, placeholder="พิมพ์อาการหรือความรู้สึกของคุณ...")

interface_options = dict(
    fn=chat_interface,
    inputs=symptom_box,
    outputs="text",
    title="Medical Symptom Chatbot (RAG)",
    description="อธิบายอาการหรือความรู้สึก เพื่อประเมินเบื้องต้น ",
)
iface = gr.Interface(**interface_options)

# Start the web app.
iface.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9c1e4286361b54d1d9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


