In [9]:
%pip install --quiet requests beautifulsoup4 lxml pandas

import requests, time, re, itertools
from urllib.parse import urljoin, urldefrag, urlparse
from bs4 import BeautifulSoup
import pandas as pd


Note: you may need to restart the kernel to use updated packages.


In [11]:
BASE = "https://mmha.org.my"

HEADERS_HTML = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}
HEADERS_XML = {
    **HEADERS_HTML,
    "Accept": "application/xml,text/xml;q=0.9,*/*;q=0.8",
}

def fetch(url, headers, timeout=30):
    r = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
    ctype = (r.headers.get("Content-Type") or "").lower()
    print(f"[fetch] {r.status_code} {r.url} ({ctype})")
    r.raise_for_status()
    return r.text, ctype, r

def soupify(html, kind="html"):
    parser = "xml" if kind=="xml" else "lxml"
    try:
        return BeautifulSoup(html, parser)
    except Exception:
        return BeautifulSoup(html, "html.parser")

def norm(u, base):
    if not u: return None
    u = urljoin(base, u)
    u, _ = urldefrag(u)
    return u

def same_site(u):
    return u and u.lower().startswith(BASE)

def looks_like_article(u: str) -> bool:
    """Generic heuristic for MMHA pages."""
    if not same_site(u): return False
    low = u.lower()
    if any(low.endswith(ext) for ext in (".pdf",".jpg",".jpeg",".png",".gif",".svg",".webp",".avif",".mp4",".json",".xml",".zip",".ico")):
        return False
    path = urlparse(low).path.rstrip("/")
    # Accept paths that look like posts: /category/..., /2024/.., /post-slug/, /news/slug, etc.
    segs = path.strip("/").split("/")
    if len(segs) >= 2:
        return True
    return False

def extract_text(soup: BeautifulSoup) -> str:
    for sel in ["article", "main", "[role='main']", ".entry-content", ".post-content", ".content"]:
        el = soup.select_one(sel)
        if el:
            return el.get_text(" ", strip=True)
    return soup.get_text(" ", strip=True)

def detect_tags(blob: str) -> str:
    tags = []
    if re.search(r"\bdepress", blob, re.I): tags.append("depression")
    if re.search(r"\banxiet|panic|phobi", blob, re.I): tags.append("anxiety")
    return ", ".join(sorted(set(tags))) or ""

YOUTH_RE = re.compile(r"youth|teen|adolescent|student|college|young adult|young people|school|parent", re.I)
def detect_audience(blob: str) -> str:
    return "youth" if YOUTH_RE.search(blob) else "general public"

def polite_sleep(s=0.3):
    time.sleep(s)


In [None]:
def discover_via_sitemap(base=BASE, limit_maps=10):
    urls = []
    try:
        html, ctype, _ = fetch(base + "/sitemap.xml", HEADERS_XML)
        if "xml" not in ctype:
            print("[sitemap] Not XML, skipping.")
            return []
        idx = soupify(html, "xml")
        # gather child sitemap xmls
        child_maps = [loc.get_text(strip=True) for loc in idx.find_all("loc")]
        child_maps = [u for u in child_maps if u.lower().endswith(".xml")]
        print("[sitemap] child maps:", len(child_maps))
        # optional trim for speed
        child_maps = child_maps[:limit_maps]
        for sm in child_maps:
            try:
                sm_html, sm_ctype, _ = fetch(sm, HEADERS_XML)
                if "xml" not in sm_ctype: 
                    continue
                sm_soup = soupify(sm_html, "xml")
                for loc in sm_soup.find_all("loc"):
                    u = loc.get_text(strip=True)
                    if looks_like_article(u):
                        urls.append(u)
                polite_sleep(0.15)
            except Exception as e:
                print("[sitemap-skip]", sm, "->", e)
                continue
    except Exception as e:
        print("[sitemap] failed:", e)
    # dedupe
    seen, deduped = set(), []
    for u in urls:
        if u not in seen:
            seen.add(u); deduped.append(u)
    return deduped

def discover_via_html(start=BASE, max_pages=120):
    """Breadth-first crawl from homepage; stop after collecting enough pages."""
    queue = [start]
    seen = set(queue)
    found = []
    while queue and len(seen) < max_pages:
        cur = queue.pop(0)
        try:
            html, ctype, resp = fetch(cur, HEADERS_HTML)
            if "html" not in ctype: 
                continue
            soup = soupify(html, "html")
        except Exception as e:
            print("[crawl-skip]", cur, "->", e); 
            continue
        # collect article-like
        for a in soup.select("a[href]"):
            u = norm(a.get("href"), resp.url)
            if not same_site(u): 
                continue
            if u not in seen:
                seen.add(u)
                # push to queue a limited set of internal pages to keep discovery going
                if len(queue) < max_pages:
                    queue.append(u)
                if looks_like_article(u):
                    found.append(u)
        polite_sleep(0.1)
    # dedupe
    seen2, deduped = set(), []
    for u in found:
        if u not in seen2:
            seen2.add(u); deduped.append(u)
    return deduped

# Run discovery
candidates = discover_via_sitemap(limit_maps=12)
print("Sitemap candidates:", len(candidates))

if len(candidates) == 0:
    print("[fallback] Using HTML crawl.")
    candidates = discover_via_html(max_pages=150)

print("Total candidates discovered:", len(candidates))
for u in itertools.islice(candidates, 0, 20):
    print(" -", u)

# For quick testing:
# candidates = candidates[:50]


[fetch] 404 https://mmha.org.my/sitemap.xml (text/html; charset=utf-8)
[sitemap] failed: 404 Client Error: Not Found for url: https://mmha.org.my/sitemap.xml
Sitemap candidates: 0
[fallback] Using HTML crawl.
[fetch] 200 https://mmha.org.my/ (text/html; charset=utf-8)
[fetch] 200 https://mmha.org.my/contact-us (text/html; charset=utf-8)
[fetch] 200 https://mmha.org.my/donate-now (text/html; charset=utf-8)
[fetch] 200 https://mmha.org.my/ (text/html; charset=utf-8)
[fetch] 200 https://mmha.org.my/find-help (text/html; charset=utf-8)
[fetch] 200 https://mmha.org.my/get-involved (text/html; charset=utf-8)
[fetch] 200 https://mmha.org.my/education-training/mhfa-info (text/html; charset=utf-8)
[fetch] 200 https://mmha.org.my/education-training/mhfa-courses (text/html; charset=utf-8)
[fetch] 200 https://mmha.org.my/education-training/caregiver-programme-cse (text/html; charset=utf-8)
[fetch] 200 https://mmha.org.my/media-resources (text/html; charset=utf-8)
[fetch] 200 https://mmha.org.my/ab

In [None]:
def parse_article(url: str) -> dict | None:
    try:
        html, ctype, _ = fetch(url, HEADERS_HTML)
        if "html" not in ctype:
            return None
        soup = soupify(html, "html")

        title_el = soup.find("h1") or soup.find("title")
        title = title_el.get_text(strip=True) if title_el else ""

        desc_el = (soup.find("meta", attrs={"name": "description"})
                   or soup.find("meta", attrs={"property": "og:description"}))
        description = (desc_el.get("content") or "").strip() if desc_el else ""

        text = extract_text(soup)
        snippet = " ".join(text.split()[:1200])
        blob = " ".join([url, title, description, snippet])

        return {
            "URL": url,
            "Title": title,
            "Description": description,
            "Tags": detect_tags(blob),
            "Tone": "supportive",
            "Audience": detect_audience(blob),
            "Source": "MMHA Malaysia",
            "Locality (yes or no)": "yes",
            "Country": "Malaysia",
            "Relevance": "",   # leave empty
        }
    except Exception as e:
        print("[skip]", url, "->", e)
        return None

rows = []
for i, u in enumerate(candidates):
    if i and i % 20 == 0:
        polite_sleep(1.0)
    polite_sleep(0.25)
    rec = parse_article(u)
    if rec:
        rows.append(rec)

df_mmha = pd.DataFrame(rows, columns=[
    "URL","Title","Description","Tags","Tone","Audience",
    "Source","Locality (yes or no)","Country","Relevance"
])
df_mmha.insert(0, "ID", range(1, len(df_mmha) + 1))

print("Rows collected:", len(df_mmha))
df_mmha.head(10)
