In [17]:
%pip install --quiet requests beautifulsoup4 lxml pandas

import requests, time, re, itertools
from urllib.parse import urljoin, urldefrag, urlparse
from bs4 import BeautifulSoup
import pandas as pd


Note: you may need to restart the kernel to use updated packages.


In [19]:
# Site root; is_article_like() checks candidate URLs against it.
BASE = "https://au.reachout.com"

# Seed pages whose outgoing links are harvested for article candidates:
# the topic hubs plus two exemplar article pages (none of them return a 404).
SEEDS = [
    "https://au.reachout.com/mental-health-issues",
    "https://au.reachout.com/mental-health-issues/anxiety",
    "https://au.reachout.com/mental-health-issues/depression",
    "https://au.reachout.com/mental-health-issues/anxiety/what-is-anxiety",
    "https://au.reachout.com/mental-health-issues/depression/depression",
]

# Browser-like request headers; used as the default `headers` of fetch_html().
HEADERS = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/120.0.0.0 Safari/537.36"),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

def fetch_html(url, *, headers=HEADERS, timeout=30):
    """
    Download `url` with a GET request and return its HTML.

    Redirects are followed. One log line with the status code, the final URL
    and the content type is printed for every request, before any error is raised.

    Returns:
        (text, response): the decoded body and the `requests` response object.

    Raises:
        requests.HTTPError: via `raise_for_status()` for 4xx/5xx responses.
        ValueError: if the Content-Type header does not contain "html".
    """
    response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)

    # A missing or empty Content-Type header is treated as the empty string.
    content_type = response.headers.get("Content-Type")
    content_type = content_type.lower() if content_type else ""

    print(f"[fetch] {response.status_code}  {response.url}  ({content_type})")
    response.raise_for_status()

    if "html" in content_type:
        return response.text, response
    raise ValueError(f"Not HTML: {url} -> {content_type}")

def soupify(html):
    """
    Parse an HTML string into a BeautifulSoup tree.

    The "lxml" parser is tried first, then the built-in "html.parser"; any
    exception from an attempt moves on to the next parser. If both attempts
    fail, "html.parser" is tried one last time and its exception propagates.
    """
    for parser_name in ("lxml", "html.parser"):
        try:
            tree = BeautifulSoup(html, parser_name)
        except Exception:
            continue
        return tree
    return BeautifulSoup(html, "html.parser")

def norm_url(href, base):
    """
    Resolve `href` against `base` and drop any `#fragment`.

    Returns the absolute URL as a string.
    """
    absolute = urljoin(base, href)
    # urldefrag() yields (url_without_fragment, fragment); keep the first item.
    return urldefrag(absolute)[0]

def is_article_like(u: str) -> bool:
    """
    Decide whether a URL looks like a ReachOut article page.

    ReachOut articles are primarily under /mental-health-issues/ and sometimes
    /articles/. Assets and hub-only pages are filtered out.

    The comparison is case-insensitive. A URL is accepted when all of these hold:
      * its scheme and hostname equal those of BASE (compared after parsing, so
        look-alike hosts such as "au.reachout.com.example.org" are rejected);
      * its path does not end in a known asset extension (the query string is
        ignored, so "image.jpg?w=300" is still recognised as an asset);
      * its path contains "/articles/", or it lies under /mental-health-issues/
        with at least three path segments in total (hub + topic + article).

    Args:
        u: absolute URL to classify.

    Returns:
        True if the URL should be scraped as an article, False otherwise.
        Malformed URLs that `urlparse` rejects yield False.
    """
    asset_exts = (".pdf", ".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp",
                  ".avif", ".mp4", ".json", ".xml")

    try:
        parsed = urlparse(u.lower())
        site = urlparse(BASE.lower())
    except ValueError:
        # e.g. unbalanced IPv6 brackets in the host part
        return False

    # Exact scheme/host match instead of a string-prefix test on the whole URL.
    if parsed.scheme != site.scheme or parsed.hostname != site.hostname:
        return False

    path = parsed.path.rstrip("/")

    # Check the extension on the path only, not on the query string or fragment.
    if path.endswith(asset_exts):
        return False

    # Accept explicit articles route
    if "/articles/" in path:
        return True

    # Accept deeper content under /mental-health-issues/ (hub pages have < 3 segments)
    if path.startswith("/mental-health-issues/"):
        return len(path.strip("/").split("/")) >= 3

    return False

def extract_main_text(soup: BeautifulSoup) -> str:
    """
    Return the text of the page's main content container.

    The first match among `main`, `[role='main']` and `article` (tried in that
    order) is used; when none matches, the text of the whole document is
    returned. Text pieces are joined with single spaces and stripped.
    """
    # Lazy generator: selectors after the first hit are never evaluated.
    matches = (soup.select_one(selector) for selector in ("main", "[role='main']", "article"))
    container = next((node for node in matches if node), soup)
    return container.get_text(" ", strip=True)

def polite_sleep(sec=0.35):
    """Block for `sec` seconds (default 0.35); used to space out requests."""
    delay_seconds = sec
    time.sleep(delay_seconds)


In [21]:
# Harvest article links from every seed page.
# `candidates` keeps first-seen order; `seen` gives O(1) duplicate checks.
candidates, seen = [], set()

for seed in SEEDS:
    try:
        html, resp = fetch_html(seed)
    except Exception as e:
        # A failing seed is reported and skipped; the other seeds still run.
        print(f"[skip seed] {seed} -> {e}")
        continue

    soup = soupify(html)
    anchors = soup.select("a[href]")
    print("[parse] anchors:", len(anchors))

    for a in anchors:
        href = a.get("href")
        if not href:
            continue
        # Resolve against the final (post-redirect) URL of the seed page.
        u = norm_url(href, resp.url)
        if u in seen or not is_article_like(u):
            continue
        seen.add(u)
        candidates.append(u)

print("Total candidates:", len(candidates))
# Preview the first 20 candidates only.
for u in candidates[:20]:
    print(" -", u)

# To shorten test runs during development, truncate the list:
# candidates = candidates[:60]


[fetch] 200  https://au.reachout.com/mental-health-issues  (text/html; charset=utf-8)
[parse] anchors: 61
[fetch] 200  https://au.reachout.com/mental-health-issues/anxiety  (text/html; charset=utf-8)
[parse] anchors: 75
[fetch] 200  https://au.reachout.com/mental-health-issues/depression  (text/html; charset=utf-8)
[parse] anchors: 67
[fetch] 200  https://au.reachout.com/mental-health-issues/anxiety/what-is-anxiety  (text/html; charset=utf-8)
[parse] anchors: 91
[fetch] 200  https://au.reachout.com/mental-health-issues/depression/depression  (text/html; charset=utf-8)
[parse] anchors: 94
Total candidates: 34
 - https://au.reachout.com/mental-health-issues/anxiety/what-is-anxiety
 - https://au.reachout.com/mental-health-issues/anxiety/how-to-manage-your-anxiety-and-stress
 - https://au.reachout.com/mental-health-issues/anxiety/how-to-get-help-for-anxiety
 - https://au.reachout.com/mental-health-issues/depression/support-services-for-depression-and-anxiety
 - https://au.reachout.com/ment

In [23]:
def detect_tags(text_or_meta: str) -> str:
    """
    Derive topic tags from free text using keyword patterns.

    Matching is case-insensitive:
      * "depression" - a word starting with "depress";
      * "anxiety"    - a word starting with "anxiet" or "panic", or any
                       occurrence of "phobi" (so compounds such as
                       "agoraphobia" still match).

    Args:
        text_or_meta: text to scan (URL, title, description, body, ...).

    Returns:
        The matching tags, sorted alphabetically and joined with ", ";
        an empty string when nothing matches or the input is empty/None.
    """
    if not text_or_meta:
        return ""

    tags = []
    if re.search(r"\bdepress", text_or_meta, flags=re.I):
        tags.append("depression")
    # The group makes \b apply to "panic" as well as "anxiet"; without it,
    # "panic" matched inside unrelated words such as "Hispanic".
    if re.search(r"\b(?:anxiet|panic)|phobi", text_or_meta, flags=re.I):
        tags.append("anxiety")
    return ", ".join(sorted(tags))

def parse_article(url: str) -> dict | None:
    """
    Fetch one article page and turn it into a record for the results table.

    The title comes from the first <h1> (falling back to <title>), the
    description from the `description` meta tag (falling back to
    `og:description`). Tags are detected over the URL, title, description and
    the first 1200 words of the main text. The remaining fields are fixed
    values describing the source site.

    Returns:
        A dict with the record, or None if anything failed; the failure is
        printed as a "[skip]" line rather than raised.
    """
    try:
        page_html, _ = fetch_html(url)
        soup = soupify(page_html)

        heading = soup.find("h1") or soup.find("title")
        title = heading.get_text(strip=True) if heading else ""

        # Prefer the standard description meta tag, then the Open Graph one.
        meta = soup.find("meta", attrs={"name": "description"})
        if not meta:
            meta = soup.find("meta", attrs={"property": "og:description"})
        if meta:
            description = (meta.get("content") or "").strip()
        else:
            description = ""

        # Only the first 1200 words of body text take part in tag detection.
        words = extract_main_text(soup).split()
        snippet = " ".join(words[:1200])

        blob = " ".join((url, title, description, snippet))

        return {
            "URL": url,
            "Title": title,
            "Description": description,
            "Tags": detect_tags(blob),
            "Tone": "supportive",                 # ReachOut's house style
            "Audience": "youth",                  # the site's target audience
            "Source": "ReachOut Australia",
            "Locality (yes or no)": "yes",        # Australia-focused organisation
            "Country": "Australia",
            "Relevance": "",                      # intentionally left blank
        }
    except Exception as e:
        print("[skip]", url, "->", e)
        return None

# Scrape every candidate URL, throttling requests along the way.
rows = []
for i, u in enumerate(candidates):
    # Extra 1.0 s pause before each batch of 20 (never before the first URL) ...
    if i > 0 and i % 20 == 0:
        polite_sleep(1.0)
    # ... on top of a 0.3 s pause before every single request.
    polite_sleep(0.3)

    rec = parse_article(u)
    if not rec:
        continue
    rows.append(rec)

df_reachout = pd.DataFrame(
    data=rows,
    columns=[
        "URL",
        "Title",
        "Description",
        "Tags",
        "Tone",
        "Audience",
        "Source",
        "Locality (yes or no)",
        "Country",
        "Relevance",
    ],
)

# Sequential 1..N identifier as the leading column.
df_reachout.insert(loc=0, column="ID", value=range(1, len(df_reachout) + 1))

print("Rows collected:", len(df_reachout))
df_reachout.head(10)


[fetch] 200  https://au.reachout.com/mental-health-issues/anxiety/what-is-anxiety  (text/html; charset=utf-8)
[fetch] 200  https://au.reachout.com/mental-health-issues/anxiety/how-to-manage-your-anxiety-and-stress  (text/html; charset=utf-8)
[fetch] 200  https://au.reachout.com/mental-health-issues/anxiety/how-to-get-help-for-anxiety  (text/html; charset=utf-8)
[fetch] 200  https://au.reachout.com/mental-health-issues/depression/support-services-for-depression-and-anxiety  (text/html; charset=utf-8)
[fetch] 200  https://au.reachout.com/mental-health-issues/anxiety/how-to-cope-if-youre-feeling-anxious-about-going-to-school  (text/html; charset=utf-8)
[fetch] 200  https://au.reachout.com/mental-health-issues/anxiety/what-is-a-panic-attack  (text/html; charset=utf-8)
[fetch] 200  https://au.reachout.com/mental-health-issues/anxiety/how-to-help-a-friend-with-panic-or-anxiety  (text/html; charset=utf-8)
[fetch] 200  https://au.reachout.com/mental-health-issues/anxiety/social-anxiety-disorde

Unnamed: 0,ID,URL,Title,Description,Tags,Tone,Audience,Source,Locality (yes or no),Country,Relevance
0,1,https://au.reachout.com/mental-health-issues/a...,What is anxiety?,What is anxiety? Worried that an anxiety disor...,"anxiety, depression",supportive,youth,ReachOut Australia,yes,Australia,
1,2,https://au.reachout.com/mental-health-issues/a...,How to manage anxiety and stress,Knowing how to manage anxiety can be confusing...,anxiety,supportive,youth,ReachOut Australia,yes,Australia,
2,3,https://au.reachout.com/mental-health-issues/a...,How to get help for anxiety,Is anxiety having a negative impact on your li...,"anxiety, depression",supportive,youth,ReachOut Australia,yes,Australia,
3,4,https://au.reachout.com/mental-health-issues/d...,Anxiety and depression support services,"Dealing with anxiety or depression is hard, bu...","anxiety, depression",supportive,youth,ReachOut Australia,yes,Australia,
4,5,https://au.reachout.com/mental-health-issues/a...,School anxiety: how to cope if you're anxious ...,"From making friends and stressing about exams,...","anxiety, depression",supportive,youth,ReachOut Australia,yes,Australia,
5,6,https://au.reachout.com/mental-health-issues/a...,Everything you need to know about panic attacks,Have you been experiencing panic attacks? Find...,"anxiety, depression",supportive,youth,ReachOut Australia,yes,Australia,
6,7,https://au.reachout.com/mental-health-issues/a...,How to help someone having a panic attack,Panic attacks can be scary at any age. Knowing...,anxiety,supportive,youth,ReachOut Australia,yes,Australia,
7,8,https://au.reachout.com/mental-health-issues/a...,Ask a therapist: Understanding Social Anxiety ...,Worried that there’s more to your social anxie...,"anxiety, depression",supportive,youth,ReachOut Australia,yes,Australia,
8,9,https://au.reachout.com/mental-health-issues/a...,Georgia's story: What happens when you feel to...,This is Georgia’s story about how she coped wi...,"anxiety, depression",supportive,youth,ReachOut Australia,yes,Australia,
9,10,https://au.reachout.com/mental-health-issues/a...,Generalised anxiety disorder,Generalised anxiety disorder (GAD) occurs when...,anxiety,supportive,youth,ReachOut Australia,yes,Australia,


In [25]:
# Write the scraped table to a CSV file in the working directory (index column omitted).
out_csv = "ReachOut_articles_raw.csv"
df_reachout.to_csv(path_or_buf=out_csv, index=False)
print("Saved:", out_csv)


Saved: ReachOut_articles_raw.csv
