In [1]:
%pip install --quiet requests beautifulsoup4 lxml pandas

import requests, time, re, itertools
from urllib.parse import urljoin, urldefrag, urlparse
from bs4 import BeautifulSoup
import pandas as pd


Note: you may need to restart the kernel to use updated packages.


In [3]:
BASE = "https://miasa.org.my"

# Valid hubs that list articles/resources. Add more if you find them.
SEEDS = [
    "https://miasa.org.my/blog/",
    "https://miasa.org.my/resources/",
]

HEADERS = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/120.0.0.0 Safari/537.36"),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

def fetch_html(url, *, headers=HEADERS, timeout=30):
    r = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
    ctype = (r.headers.get("Content-Type") or "").lower()
    print(f"[fetch] {r.status_code}  {r.url}  ({ctype})")
    r.raise_for_status()
    if "html" not in ctype:
        raise ValueError(f"Not HTML: {url} -> {ctype}")
    return r.text, r

def soupify(html):
    for p in ("lxml", "html.parser"):
        try:
            return BeautifulSoup(html, p)
        except Exception:
            pass
    return BeautifulSoup(html, "html.parser")

def norm_url(href, base):
    if not href:
        return None
    u = urljoin(base, href)
    u, _ = urldefrag(u)
    return u

def same_site(u: str) -> bool:
    return u.lower().startswith(BASE)

def is_article_like(u: str) -> bool:
    """
    MIASA articles commonly live under /blog/… and /resources/…
    Require at least 1 slug after those prefixes to avoid hub pages.
    """
    u = u.lower()
    if not same_site(u):
        return False
    if any(u.endswith(ext) for ext in (".pdf",".jpg",".jpeg",".png",".gif",".svg",".webp",".avif",".mp4",".json",".xml",".zip")):
        return False
    path = urlparse(u).path.rstrip("/")
    if path.startswith("/blog/") or path.startswith("/resources/"):
        segs = path.strip("/").split("/")
        return len(segs) >= 2  # e.g., /blog/<slug>
    return False

def extract_main_text(soup: BeautifulSoup) -> str:
    # Try semantic containers first
    for sel in ["article", "main", "[role='main']", ".entry-content", ".post-content", ".content"]:
        el = soup.select_one(sel)
        if el:
            return el.get_text(" ", strip=True)
    return soup.get_text(" ", strip=True)

def polite_sleep(sec=0.35):
    time.sleep(sec)

# Simple tag detector for your "Tags" column
def detect_tags(text_or_meta: str) -> str:
    tags = []
    if re.search(r"\bdepress", text_or_meta, flags=re.I):
        tags.append("depression")
    if re.search(r"\banxiet|panic|phobi", text_or_meta, flags=re.I):
        tags.append("anxiety")
    return ", ".join(sorted(set(tags))) or ""

# Optional: detect youth-vs-public in copy (fallback to "general public")
YOUTH_RE = re.compile(r"youth|teen|adolescent|student|college|young adult|young people|school|parent", re.I)
def detect_audience(blob: str) -> str:
    return "youth" if YOUTH_RE.search(blob) else "general public"


In [5]:
def discover_from_seed(seed: str, max_pages: int = 8):
    """
    Crawl a hub page and a few 'page/N' links if present (common on WP sites).
    Returns list of candidate article URLs.
    """
    found, seen = [], set()

    # base page
    try:
        html, resp = fetch_html(seed)
        soup = soupify(html)
    except Exception as e:
        print(f"[skip seed] {seed} -> {e}")
        return found

    # Try to infer pagination pattern: /page/2/, /page/3/, etc.
    pagers = set()
    for a in soup.select("a[href]"):
        u = norm_url(a.get("href"), resp.url)
        if not u or not same_site(u):
            continue
        if re.search(r"/page/\d+/?$", urlparse(u).path, flags=re.I):
            pagers.add(u)

    # Include the seed itself first
    pages = [resp.url]
    # Add a few numeric pages if any discovered
    if pagers:
        # Sort numerically and cap
        def page_num(u):
            m = re.search(r"/page/(\d+)/?$", urlparse(u).path)
            return int(m.group(1)) if m else 999999
        pages += [u for u in sorted(pagers, key=page_num)[:max_pages-1]]

    # Visit each hub page and collect article-like links
    for pg in pages:
        try:
            html, r2 = fetch_html(pg)
            sp = soupify(html)
        except Exception as e:
            print(f"[skip hub] {pg} -> {e}")
            continue
        for a in sp.select("a[href]"):
            u = norm_url(a.get("href"), r2.url)
            if not u or not same_site(u):
                continue
            if is_article_like(u) and u not in seen:
                seen.add(u)
                found.append(u)

    return found

# Discover across all seeds
candidates, global_seen = [], set()
for seed in SEEDS:
    urls = discover_from_seed(seed, max_pages=8)
    for u in urls:
        if u not in global_seen:
            global_seen.add(u)
            candidates.append(u)

print("Total candidates discovered:", len(candidates))
for u in itertools.islice(candidates, 0, 20):
    print(" -", u)

# Speed up dev while testing:
# candidates = candidates[:60]


[fetch] 200  https://miasa.org.my/blog/  (text/html; charset=utf-8)
[fetch] 200  https://miasa.org.my/blog/  (text/html; charset=utf-8)
[fetch] 200  https://miasa.org.my/blog/page/2/  (text/html; charset=utf-8)
[fetch] 200  https://miasa.org.my/blog/page/3/  (text/html; charset=utf-8)
[fetch] 200  https://miasa.org.my/blog/page/10/  (text/html; charset=utf-8)
[fetch] 404  https://miasa.org.my/resources/  (text/html; charset=utf-8)
[skip seed] https://miasa.org.my/resources/ -> 404 Client Error: Not Found for url: https://miasa.org.my/resources/
Total candidates discovered: 7
 - https://miasa.org.my/blog/page/2/
 - https://miasa.org.my/blog/page/3/
 - https://miasa.org.my/blog/page/10/
 - https://miasa.org.my/blog/page/4/
 - https://miasa.org.my/blog/page/5/
 - https://miasa.org.my/blog/page/9/
 - https://miasa.org.my/blog/page/8/


In [7]:
def parse_article(url: str) -> dict | None:
    try:
        html, _ = fetch_html(url)
        soup = soupify(html)

        title_el = soup.find("h1") or soup.find("title")
        title = title_el.get_text(strip=True) if title_el else ""

        desc_el = (soup.find("meta", attrs={"name": "description"})
                   or soup.find("meta", attrs={"property": "og:description"}))
        description = (desc_el.get("content") or "").strip() if desc_el else ""

        text = extract_main_text(soup)
        snippet = " ".join(text.split()[:1200])
        blob = " ".join([url, title, description, snippet])

        row = {
            "URL": url,
            "Title": title,
            "Description": description,
            "Tags": detect_tags(blob),
            "Tone": "supportive",                 # NGO education/support
            "Audience": detect_audience(blob),    # youth vs general public
            "Source": "MIASA Malaysia",
            "Locality (yes or no)": "yes",        # local NGO
            "Country": "Malaysia",
            "Relevance": "",                      # leave empty per your request
        }
        return row
    except Exception as e:
        print("[skip]", url, "->", e)
        return None

rows = []
for i, u in enumerate(candidates):
    if i and i % 20 == 0:
        polite_sleep(1.0)
    polite_sleep(0.3)
    rec = parse_article(u)
    if rec:
        rows.append(rec)

df_miasa = pd.DataFrame(rows, columns=[
    "URL","Title","Description","Tags","Tone","Audience",
    "Source","Locality (yes or no)","Country","Relevance"
])

# Add ID as first column (1..N)
df_miasa.insert(0, "ID", range(1, len(df_miasa) + 1))

print("Rows collected:", len(df_miasa))
df_miasa.head(10)


[fetch] 200  https://miasa.org.my/blog/page/2/  (text/html; charset=utf-8)
[fetch] 200  https://miasa.org.my/blog/page/3/  (text/html; charset=utf-8)
[fetch] 200  https://miasa.org.my/blog/page/10/  (text/html; charset=utf-8)
[fetch] 200  https://miasa.org.my/blog/page/4/  (text/html; charset=utf-8)
[fetch] 200  https://miasa.org.my/blog/page/5/  (text/html; charset=utf-8)
[fetch] 200  https://miasa.org.my/blog/page/9/  (text/html; charset=utf-8)
[fetch] 200  https://miasa.org.my/blog/page/8/  (text/html; charset=utf-8)
Rows collected: 7


Unnamed: 0,ID,URL,Title,Description,Tags,Tone,Audience,Source,Locality (yes or no),Country,Relevance
0,1,https://miasa.org.my/blog/page/2/,Blog - Mental Illness Awareness & Support Asso...,,,supportive,youth,MIASA Malaysia,yes,Malaysia,
1,2,https://miasa.org.my/blog/page/3/,Blog - Mental Illness Awareness & Support Asso...,,,supportive,general public,MIASA Malaysia,yes,Malaysia,
2,3,https://miasa.org.my/blog/page/10/,Blog - Mental Illness Awareness & Support Asso...,,depression,supportive,general public,MIASA Malaysia,yes,Malaysia,
3,4,https://miasa.org.my/blog/page/4/,Blog - Mental Illness Awareness & Support Asso...,,depression,supportive,general public,MIASA Malaysia,yes,Malaysia,
4,5,https://miasa.org.my/blog/page/5/,Blog - Mental Illness Awareness & Support Asso...,,,supportive,general public,MIASA Malaysia,yes,Malaysia,
5,6,https://miasa.org.my/blog/page/9/,Blog - Mental Illness Awareness & Support Asso...,,anxiety,supportive,general public,MIASA Malaysia,yes,Malaysia,
6,7,https://miasa.org.my/blog/page/8/,Blog - Mental Illness Awareness & Support Asso...,,depression,supportive,youth,MIASA Malaysia,yes,Malaysia,


In [9]:
out_csv = "MIASA_articles_raw.csv"
df_miasa.to_csv(out_csv, index=False)
print("Saved:", out_csv)


Saved: MIASA_articles_raw.csv
