In [9]:
%pip install --quiet requests beautifulsoup4 lxml pandas

import requests, time, re, itertools
from urllib.parse import urljoin, urldefrag, urlparse
from bs4 import BeautifulSoup
import pandas as pd


Note: you may need to restart the kernel to use updated packages.


In [11]:
# Site root; is_article_like() uses it to decide whether a link stays on-site.
BASE = "https://www.nimh.nih.gov"

# Topic landing pages whose outgoing links seed the candidate list.
SEEDS = [
    f"{BASE}/health/topics/{slug}"
    for slug in ("depression", "anxiety-disorders")
]

# Request headers sent with every fetch (browser-style User-Agent).
HEADERS = {
    "User-Agent": " ".join([
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/120.0.0.0 Safari/537.36",
    ]),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

def fetch_html(url, *, headers=HEADERS, timeout=30):
    """Download `url` and return `(html_text, response)`.

    Redirects are followed. One `[fetch]` line (status code, final URL,
    content type) is printed for every request, including failed ones.

    Raises:
        Whatever `response.raise_for_status()` raises for an error status.
        ValueError: if the Content-Type header does not contain "html".
    """
    response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
    content_type = (response.headers.get("Content-Type") or "").lower()
    print(f"[fetch] {response.status_code}  {response.url}  ({content_type})")
    response.raise_for_status()
    if "html" in content_type:
        return response.text, response
    raise ValueError(f"Not HTML: {url} -> {content_type}")

def soupify(html):
    """Parse `html` into a BeautifulSoup tree.

    The "lxml" parser is tried first; if constructing the soup with it raises
    for any reason (for example the parser is not installed), the built-in
    "html.parser" is used instead. An error from the fallback parser
    propagates to the caller.
    """
    try:
        return BeautifulSoup(html, "lxml")
    except Exception:
        # Best effort: any lxml failure falls through to the stdlib parser.
        pass
    # Single fallback attempt; the document is no longer parsed twice on failure.
    return BeautifulSoup(html, "html.parser")

def norm_url(href, base):
    """Resolve `href` against `base` and drop any `#fragment` part."""
    absolute = urljoin(base, href)
    return urldefrag(absolute).url

def is_article_like(u: str) -> bool:
    """Keep NIMH health content; skip assets.

    A URL qualifies when, compared case-insensitively:
      * its scheme and host equal those of BASE (a plain prefix test would
        also accept look-alike hosts such as "<BASE>.example.com"),
      * its path does not end in a known asset extension (checked on the
        path only, so "file.pdf?download=1" is still rejected), and
      * its path contains "/health/".

    Returns False for strings that cannot be parsed as a URL.
    """
    asset_exts = (
        ".pdf", ".jpg", ".jpeg", ".png", ".gif", ".svg",
        ".webp", ".avif", ".mp4", ".json", ".xml",
    )
    try:
        target = urlparse(u.lower())
        site = urlparse(BASE.lower())
    except ValueError:
        # urlparse rejects some malformed netlocs; such links are not articles.
        return False
    if (target.scheme, target.hostname) != (site.scheme, site.hostname):
        return False
    if target.path.endswith(asset_exts):
        return False
    return "/health/" in target.path

def extract_main_text(soup: BeautifulSoup) -> str:
    """Return the space-joined, stripped text of the page's main content.

    The first element matched by "main", "[role='main']" or "article"
    (tried in that order) is used; when none matches, the text of the whole
    document is returned.
    """
    matches = (soup.select_one(css) for css in ("main", "[role='main']", "article"))
    container = next((node for node in matches if node), soup)
    return container.get_text(" ", strip=True)

def extract_date(soup: BeautifulSoup) -> str | None:
    """Return a date string found in the page, or None.

    <meta> tags are checked in a fixed order and the stripped `content` of the
    first one that has a non-empty value is returned. If none qualifies, the
    visible text of the first <time> element is used when it is non-empty.
    """
    meta_lookups = (
        {"name": "date"},
        {"name": "last-modified"},
        {"property": "article:published_time"},
        {"property": "article:modified_time"},
        {"itemprop": "datePublished"},
    )
    for lookup in meta_lookups:
        meta = soup.find("meta", attrs=lookup)
        content = meta.get("content") if meta else None
        if content:
            return content.strip()

    # Fallback: visible text of the first <time> element (very light)
    time_el = soup.find("time")
    visible = time_el.get_text(strip=True) if time_el else ""
    return visible or None

def polite_sleep(sec=0.35):
    """Block the current thread for `sec` seconds (0.35 by default)."""
    pause_seconds = sec
    time.sleep(pause_seconds)


In [13]:
# Collect unique, article-like links from each seed page, in discovery order.
candidates, seen = [], set()

for seed in SEEDS:
    html, resp = fetch_html(seed)
    soup = soupify(html)
    anchors = soup.select("a[href]")
    print("[parse] anchors:", len(anchors))
    for anchor in anchors:
        href = anchor.get("href")
        if not href:
            continue
        # Resolve against the final (post-redirect) URL of the seed page.
        link = norm_url(href, resp.url)
        if link in seen or not is_article_like(link):
            continue
        seen.add(link)
        candidates.append(link)

print("Candidate URLs (first 20):")
for link in candidates[:20]:
    print(" -", link)
print("Total candidates:", len(candidates))


[fetch] 200  https://www.nimh.nih.gov/health/topics/depression  (text/html; charset=utf-8)
[parse] anchors: 171
[fetch] 200  https://www.nimh.nih.gov/health/topics/anxiety-disorders  (text/html; charset=utf-8)
[parse] anchors: 172
Candidate URLs (first 20):
 - https://www.nimh.nih.gov/health/topics/depression
 - https://www.nimh.nih.gov/health/find-help
 - https://www.nimh.nih.gov/health/topics/espanol
 - https://www.nimh.nih.gov/health/topics
 - https://www.nimh.nih.gov/health/statistics
 - https://www.nimh.nih.gov/health/publications
 - https://www.nimh.nih.gov/health/trials
 - https://www.nimh.nih.gov/health/topics/anxiety-disorders
 - https://www.nimh.nih.gov/health/topics/attention-deficit-hyperactivity-disorder-adhd
 - https://www.nimh.nih.gov/health/topics/autism-spectrum-disorders-asd
 - https://www.nimh.nih.gov/health/topics/bipolar-disorder
 - https://www.nimh.nih.gov/health/topics/borderline-personality-disorder
 - https://www.nimh.nih.gov/health/topics/covid-19-and-mental-h

In [15]:
# Topic & audience vocab for scoring/tags.
# Each key is a regex (searched case-insensitively by score_text) and each
# value is the weight added once when that regex matches anywhere in the text.
TOPIC_TERMS = {
    r"\bdepression\b": 3,
    r"\bdepressive\b": 2,
    r"\bmajor depressive\b": 3,
    r"\banxiety\b": 3,
    r"\banxious\b": 2,
    r"\bgeneralized anxiety\b": 3,
    r"\bgad\b": 2,
    r"\bpanic (disorder|attack)s?\b": 2,
    # Matches "phobia", "phobias", "phobic" and compounds such as
    # "agoraphobia"; the previous r"\bphobi(as)?\b" missed singular "phobia".
    r"\b\w*phobi(?:as?|c)\b": 1,
}

AUDIENCE_TERMS = {
    r"\bgeneral public\b": 2,
    r"\bfor the public\b": 2,
    r"\bpublic\b": 1,
    r"\byouth\b": 2,
    r"\bteen(ager)?s?\b": 2,
    r"\badolescent(s)?\b": 2,
    r"\bchildren\b": 2,
    r"\bkids?\b": 1,
    r"\bstudent(s)?\b": 1,
    r"\bcollege\b": 1,
    r"\byoung (people|adults)\b": 2,
    r"\bparents?\b": 1,
}

def score_text(text: str, patterns: dict) -> int:
    """Sum the weights of every pattern that occurs in `text`.

    `patterns` maps regex strings to integer weights. Each regex is searched
    case-insensitively and contributes its weight at most once, regardless of
    how many times it matches.
    """
    return sum(
        weight
        for regex, weight in patterns.items()
        if re.search(regex, text, flags=re.I)
    )

def detect_tags(blob: str) -> str:
    """Return the topic tags found in `blob` as a comma-separated string.

    Possible tags are "anxiety" and "depression"; when both apply they are
    returned in alphabetical order ("anxiety, depression"). An empty string
    is returned when neither applies. Matching is case-insensitive.
    """
    tags = []
    # "anxiety" and "panic" must start a word, otherwise "panic" fires inside
    # "Hispanic". "phobi" stays unanchored so compounds such as "agoraphobia"
    # still count.
    if re.search(r"\b(?:anxiety|panic)|phobi", blob, flags=re.I):
        tags.append("anxiety")
    if re.search(r"\bdepress", blob, flags=re.I):
        tags.append("depression")
    return ", ".join(sorted(tags))

def detect_audience(blob: str) -> str:
    """Classify the intended audience of `blob` as "youth" or "general public".

    Returns "youth" when any youth-related term occurs as a whole word
    (case-insensitive), otherwise "general public". Whole-word matching stops
    substrings from triggering the label, e.g. "teen" inside "thirteen" or
    "parent" inside "apparent" / "transparent".
    """
    youth_terms = (
        r"\b(?:youths?|teen(?:ager)?s?|adolescents?|children|kids?|students?"
        r"|college|young (?:people|adults)|parents?)\b"
    )
    # If youth signals present, label youth; else general public
    if re.search(youth_terms, blob, flags=re.I):
        return "youth"
    return "general public"

def parse_article(url: str) -> dict | None:
    """Fetch `url`, score it, and return one row for the output table.

    The page title comes from the first <h1> (falling back to <title>), the
    description from the "description" meta tag (falling back to
    "og:description" when the former is missing or empty). URL, title,
    description and the first 1200 words of the main text are joined into a
    lower-cased blob that is scored against TOPIC_TERMS and AUDIENCE_TERMS.

    Returns:
        A dict with the keys URL, Title, Description, Tags, Tone, Audience,
        Source, "Locality (yes or no)", Country and Relevance, or None when
        anything goes wrong (the error is printed as a "[skip]" line).
    """
    try:
        html, _ = fetch_html(url)
        soup = soupify(html)

        title_el = soup.find("h1") or soup.find("title")
        title = title_el.get_text(strip=True) if title_el else ""

        # Take the first non-empty description; an empty name="description"
        # tag no longer hides a populated og:description.
        description = ""
        for attrs in ({"name": "description"}, {"property": "og:description"}):
            meta = soup.find("meta", attrs=attrs)
            content = (meta.get("content") or "").strip() if meta else ""
            if content:
                description = content
                break

        text = extract_main_text(soup)
        snippet = " ".join(text.split()[:1200])  # keep it light

        blob = " ".join([url, title, description, snippet]).lower()
        topic_score    = score_text(blob, TOPIC_TERMS)
        audience_score = score_text(blob, AUDIENCE_TERMS)

        # URL path bonus for topic
        path = urlparse(url).path.lower()
        if any(k in path for k in ("depress","anxiety","panic")):
            topic_score += 2

        relevance = topic_score + audience_score

        # Map to your schema
        tags     = detect_tags(blob)
        audience = detect_audience(blob)

        row = {
            "URL": url,
            "Title": title,
            "Description": description,
            "Tags": tags,
            "Tone": "informative",
            "Audience": audience,
            "Source": "NIMH",
            "Locality (yes or no)": "no",
            "Country": "United States",
            "Relevance": relevance,
        }
        return row
    except Exception as e:
        # Deliberate best effort: one bad page must not abort the whole crawl.
        print("[skip]", url, "->", e)
        return None

# Score every candidate page, pausing before each request, and keep the pages
# whose relevance is positive.
rows = []
for idx, url in enumerate(candidates):
    if idx > 0 and idx % 20 == 0:
        polite_sleep(1.0)  # extra pause at every 20th candidate
    polite_sleep(0.3)
    record = parse_article(url)
    if not record or record["Relevance"] <= 0:  # your requirement
        continue
    rows.append(record)

schema_columns = [
    "URL", "Title", "Description", "Tags", "Tone", "Audience",
    "Source", "Locality (yes or no)", "Country", "Relevance",
]
df = pd.DataFrame(rows, columns=schema_columns)

# Add ID as first column (1..N)
df.insert(0, "ID", range(1, len(df) + 1))

print("Rows kept:", len(df))
df.head(10)


[fetch] 200  https://www.nimh.nih.gov/health/topics/depression  (text/html; charset=utf-8)
[fetch] 200  https://www.nimh.nih.gov/health/find-help  (text/html; charset=utf-8)
[fetch] 200  https://www.nimh.nih.gov/health/topics/espanol  (text/html; charset=utf-8)
[fetch] 200  https://www.nimh.nih.gov/health/topics  (text/html; charset=utf-8)
[fetch] 200  https://www.nimh.nih.gov/health/statistics  (text/html; charset=utf-8)
[fetch] 200  https://www.nimh.nih.gov/health/publications  (text/html; charset=utf-8)
[fetch] 200  https://www.nimh.nih.gov/health/trials  (text/html; charset=utf-8)
[fetch] 200  https://www.nimh.nih.gov/health/topics/anxiety-disorders  (text/html; charset=utf-8)
[fetch] 200  https://www.nimh.nih.gov/health/topics/attention-deficit-hyperactivity-disorder-adhd  (text/html; charset=utf-8)
[fetch] 200  https://www.nimh.nih.gov/health/topics/autism-spectrum-disorders-asd  (text/html; charset=utf-8)
[fetch] 200  https://www.nimh.nih.gov/health/topics/bipolar-disorder  (tex

Unnamed: 0,ID,URL,Title,Description,Tags,Tone,Audience,Source,Locality (yes or no),Country,Relevance
0,1,https://www.nimh.nih.gov/health/topics/depression,Depression,Learn about NIMH research on depression. Find ...,"anxiety, depression",informative,youth,NIMH,no,United States,17
1,2,https://www.nimh.nih.gov/health/find-help,Help for Mental Illnesses,If you or someone you know has a mental illnes...,,informative,youth,NIMH,no,United States,2
2,3,https://www.nimh.nih.gov/health/topics,Health Topics,"Learn more about mental disorders, treatments ...","anxiety, depression",informative,youth,NIMH,no,United States,10
3,4,https://www.nimh.nih.gov/health/statistics,Statistics,NIMH statistics pages include statistics on th...,"anxiety, depression",informative,general public,NIMH,no,United States,13
4,5,https://www.nimh.nih.gov/health/publications,Brochures and Fact Sheets,"Download, read, and order free NIMH brochures ...","anxiety, depression",informative,youth,NIMH,no,United States,18
5,6,https://www.nimh.nih.gov/health/trials,Clinical Trials – Information for Participants,"Learn about clinical trials, why people partic...",,informative,youth,NIMH,no,United States,2
6,7,https://www.nimh.nih.gov/health/topics/anxiety...,Anxiety Disorders,Learn about NIMH research on anxiety disorders...,"anxiety, depression",informative,youth,NIMH,no,United States,22
7,8,https://www.nimh.nih.gov/health/topics/attenti...,Attention-Deficit/Hyperactivity Disorder (ADHD),NIMH researches attention-deficit/hyperactivit...,"anxiety, depression",informative,youth,NIMH,no,United States,11
8,9,https://www.nimh.nih.gov/health/topics/autism-...,Autism Spectrum Disorder,Learn about NIMH research on autism spectrum d...,,informative,youth,NIMH,no,United States,5
9,10,https://www.nimh.nih.gov/health/topics/bipolar...,Bipolar Disorder,Learn about NIMH research on bipolar disorder....,"anxiety, depression",informative,youth,NIMH,no,United States,10


In [17]:
# Write the table to disk without the pandas index (the frame carries its own ID column).
out_csv = "NIMH_articles_raw.csv"
df.to_csv(path_or_buf=out_csv, index=False)
print("Saved:", out_csv)


Saved: NIMH_articles_raw.csv
