In [31]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time, random

In [33]:
base_url = "https://www.who.int/health-topics/mental-health"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

r = requests.get(base_url, headers=headers)
soup = BeautifulSoup(r.text, "lxml")

# Collect related links that lead to static WHO pages
related_links = []
for a in soup.find_all("a", href=True):
    href = a["href"]
    if any(keyword in href for keyword in [
        "mental", "depression", "anxiety", "stress", "suicide", "wellbeing", "substance"
    ]) and ("who.int" in href or href.startswith("/")):
        full_url = "https://www.who.int" + href if href.startswith("/") else href
        if "pdf" not in full_url:
            related_links.append(full_url)

related_links = list(set(related_links))
print("✅ Total related links found:", len(related_links))
print(related_links[:10])

✅ Total related links found: 49
['https://www.who.int/news-room/fact-sheets/detail/mental-health-at-work', 'https://www.who.int/multi-media/details/estonia---who-s-mental-health-and-psychosocial-support-global-simulation-exercise', 'https://www.who.int/campaigns/world-mental-health-day/2025', 'https://www.who.int/groups/strategic-and-technical-advisory-group-on-mental-health--brain-health-and-substance-use-(stag-mns)', 'https://www.who.int/news-room/fact-sheets/detail/depression', 'https://www.who.int/news-room/fact-sheets/detail/mental-health-of-older-adults', 'https://www.who.int/publications/i/item/special-initiative-for-mental-health-(2019-2023)', 'https://www.who.int/news-room/feature-stories/detail/mental-health-in-emergencies-feature-profile--ms-rajiah-abu-sway--mental-health-lead--who-in-the-occupied-palestinian-territory-(opt)', 'https://www.who.int/multi-media/details/ethiopia---emergencies-preparedness--mental-health-and-psychosocial-support', 'https://www.who.int/multi-medi

In [38]:
related_links = [
    link for link in related_links
    if any(x in link for x in ["/fact-sheets/", "/health-topics/"])
]
print("Filtered WHO links:", len(related_links))

Filtered WHO links: 14


In [40]:
def scrape_who_article(url):
    try:
        r = requests.get(url, headers=headers, timeout=20)
        soup = BeautifulSoup(r.text, "lxml")

        title_tag = soup.find("h1") or soup.find("h2")
        title = title_tag.get_text(strip=True) if title_tag else "No title found"

        # broaden search
        content_div = (
            soup.find("div", class_="sf-detail-body-wrapper")
            or soup.find("div", class_="main-content")
            or soup.find("article")
            or soup.find("section", class_="content")
            or soup.find("div", {"id": "PageContent_C008_Col01"})
        )

        if content_div:
            paragraphs = [p.get_text(strip=True) for p in content_div.find_all("p")]
            description = " ".join(paragraphs)
        else:
            description = "No content found"

        return {"URL": url, "Title": title, "Description": description}
    except Exception as e:
        print(f"⚠️ Error scraping {url}: {e}")
        return None

In [44]:
articles = []

for i, link in enumerate(related_links[:100]):  # limit to 100 for safety
    data = scrape_who_article(link)
    if data and len(data["Description"].strip()) > 30:
        articles.append(data)
        print(f"{i+1}/{len(related_links)} ✅ {data['Title'][:60]}")
    else:
        print(f"{i+1}/{len(related_links)} ⚠️ Skipped ({data['Description'][:50] if data else 'no response'})")
    time.sleep(random.uniform(2, 5))  # polite delay

1/14 ✅ Mental health at work
2/14 ✅ Depressive disorder (depression)
3/14 ✅ Mental health of older adults
4/14 ✅ Depression
5/14 ✅ Mental disorders
6/14 ✅ Anxiety disorders
7/14 ✅ Suicide prevention
8/14 ✅ Mental disorders
9/14 ✅ Mental health in emergencies
10/14 ✅ Post-traumatic stress disorder
11/14 ✅ Refugee and migrant mental health
12/14 ✅ Mental health
13/14 ✅ Suicide
14/14 ✅ Mental health of adolescents


In [46]:
df = pd.DataFrame(articles)
df.insert(0, "ID", range(1, len(df) + 1))

# Add standard columns for your FYP schema
df["Tags"] = ""
df["Tone"] = ""
df["Audience"] = ""
df["Source"] = "World Health Organization (WHO)"
df["Locality (yes or no)"] = "no"
df["Country"] = "International"
df["Relevance"] = ""

print("✅ Total WHO articles scraped:", len(df))
df.head()

✅ Total WHO articles scraped: 14


Unnamed: 0,ID,URL,Title,Description,Tags,Tone,Audience,Source,Locality (yes or no),Country,Relevance
0,1,https://www.who.int/news-room/fact-sheets/deta...,Mental health at work,Almost 60% of the world population is in work ...,,,,World Health Organization (WHO),no,International,
1,2,https://www.who.int/news-room/fact-sheets/deta...,Depressive disorder (depression),Overview Depressive disorder (also known as de...,,,,World Health Organization (WHO),no,International,
2,3,https://www.who.int/news-room/fact-sheets/deta...,Mental health of older adults,The world’s population is ageing fast. In 2023...,,,,World Health Organization (WHO),no,International,
3,4,https://www.who.int/health-topics/depression,Depression,"Depressive disorder, or depression, is a commo...",,,,World Health Organization (WHO),no,International,
4,5,https://www.who.int/news-room/fact-sheets/deta...,Mental disorders,A mental disorder is characterized by a clini...,,,,World Health Organization (WHO),no,International,


In [48]:
df.to_csv("WHO_articles_raw.csv", index=False, encoding="utf-8")
print("✅ WHO dataset saved as WHO_articles_raw.csv")

✅ WHO dataset saved as WHO_articles_raw.csv
