In [19]:
# --- 1. Import necessary libraries ---
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time, random, re

In [20]:
# --- 2. Sitemap location for the source being scraped ---
# Every page URL used by the later cells is discovered from this one file.
sitemap_url = "https://www.mentalhealth.org.uk/sitemap.xml"

In [21]:
# --- 3. Parse sitemap to extract all URLs ---
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as a sitemap.
response = requests.get(sitemap_url, timeout=10)
response.raise_for_status()

# Pass raw bytes so the XML parser honours the encoding declared in the
# document instead of the one requests guessed from the HTTP headers.
soup = BeautifulSoup(response.content, "xml")

# Each <loc> element holds one URL; strip the whitespace that pretty-printed
# sitemaps leave around the value.
urls = [loc.get_text(strip=True) for loc in soup.find_all("loc")]

print("Total URLs found:", len(urls))

Total URLs found: 1499


In [36]:
# Sanity check: how many URLs were found, then the first 20 of them
# (one print call, newline-separated, so the output is two lines).
print(len(urls), urls[:20], sep="\n")

1499
['https://www.mentalhealth.org.uk/', 'https://www.mentalhealth.org.uk/cy', 'https://www.mentalhealth.org.uk/explore-mental-health/blogs/tackling-digital-exclusion-older-people', 'https://www.mentalhealth.org.uk/explore-mental-health/blogs/mental-health-advice-older-people-during-coronavirus-outbreak', 'https://www.mentalhealth.org.uk/explore-mental-health/blogs/focus-mental-health-minority-men', 'https://www.mentalhealth.org.uk/explore-mental-health/blogs/mental-health-research-whats-it-us', 'https://www.mentalhealth.org.uk/explore-mental-health/blogs/nhs-long-term-plan-progress-and-way-forward', 'https://www.mentalhealth.org.uk/explore-mental-health/blogs/70-years-do-we-understand-prevention', 'https://www.mentalhealth.org.uk/explore-mental-health/blogs/reclaiming-our-heritage', 'https://www.mentalhealth.org.uk/explore-mental-health/blogs/refugees-behind-every-statistic-there-person-and-life-matters', 'https://www.mentalhealth.org.uk/explore-mental-health/blogs/excellence-youth-w

In [46]:
# --- 4. Keep only the URLs about the topics of interest ---
# Change the keywords below to suit the target website's URL structure.
# A URL is kept when it contains at least one keyword (case-sensitive substring match).
article_urls = [
    u for u in urls
    if any(keyword in u for keyword in ("depression", "anxiety"))
]
print("Filtered article URLs:", len(article_urls))
print(article_urls[:10])

Filtered article URLs: 44
['https://www.mentalhealth.org.uk/explore-mental-health/blogs/i-feel-frightened-angry-world-outside-my-door-impact-brexit-anxiety', 'https://www.mentalhealth.org.uk/explore-mental-health/blogs/why-anxiety-theme-mental-health-awareness-week-2023', 'https://www.mentalhealth.org.uk/explore-mental-health/blogs/financial-distress-and-anxiety-during-holidays', 'https://www.mentalhealth.org.uk/explore-mental-health/a-z-topics/anxiety', 'https://www.mentalhealth.org.uk/explore-mental-health/a-z-topics/depression', 'https://www.mentalhealth.org.uk/explore-mental-health/a-z-topics/postnatal-depression', 'https://www.mentalhealth.org.uk/explore-mental-health/statistics/anxiety-statistics', 'https://www.mentalhealth.org.uk/explore-mental-health/statistics/depression-statistics', 'https://www.mentalhealth.org.uk/explore-mental-health/stories/gabis-story-struggle-speaking-about-my-depression-and-loneliness', 'https://www.mentalhealth.org.uk/explore-mental-health/stories/tal

In [48]:
# --- 5. Define scraping function for one article ---
# Identifies the bot to the site on every article request.
# NOTE(review): the contact address looks like a template placeholder —
# replace it with a real one before running a full crawl.
headers = {"User-Agent": "FYPResearchBot/1.0 (+mailto:youremail@example.com)"}


def _node_text(node):
    """Return a tag's text with every run of whitespace collapsed to one space."""
    # get_text(strip=True) joins the text nodes with "" and therefore fuses
    # words that sit next to inline tags ("see <a>this</a> page" becomes
    # "seethispage"), so take the raw text and normalise whitespace instead.
    return " ".join(node.get_text().split())


def scrape_article(url):
    """
    Fetch one article page and extract its title and body text.

    The title is the text of the first <h1> on the page; the description is
    the text of every non-empty <p> element joined with single spaces.
    Adjust the tag lookups if the site's HTML structure is different.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with the keys "URL", "Title" and "Description", or None when
        fetching or parsing fails (the error is printed, not raised).
    """
    try:
        r = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx responses as failures so error pages are not stored
        # as if they were articles.
        r.raise_for_status()

        # Parse the raw bytes: the parser can then detect the page encoding
        # itself rather than rely on the guess behind r.text.
        soup = BeautifulSoup(r.content, "lxml")

        # Look the heading up once and fall back to a marker when it is absent.
        heading = soup.find("h1")
        title = _node_text(heading) if heading else "No title"

        # Combine all paragraphs, skipping empty ones so the joined text does
        # not contain runs of spaces.
        paragraphs = [
            text
            for text in (_node_text(p) for p in soup.find_all("p"))
            if text
        ]
        description = " ".join(paragraphs)

        return {
            "URL": url,
            "Title": title,
            "Description": description
        }
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller
        # carry on with the remaining URLs.
        print("❌ Error scraping:", url, e)
        return None

In [50]:
# --- 6. Loop through URLs and collect data ---
articles = []

# Cap the run while testing. Progress is reported against this batch rather
# than the full list, so the counter stays correct once more than 200 URLs
# match the filter.
urls_to_scrape = article_urls[:200]
total_to_scrape = len(urls_to_scrape)

for i, url in enumerate(urls_to_scrape, start=1):
    data = scrape_article(url)
    if data:  # scrape_article returns None (after printing the error) on failure
        articles.append(data)
        print(f"{i}/{total_to_scrape} ✅ {data['Title'][:60]}")
    if i < total_to_scrape:
        # Polite delay between requests; nothing to wait for after the last one.
        time.sleep(random.uniform(2, 5))

print("✅ Scraping complete. Total collected:", len(articles))

1/44 ✅ “I feel frightened of the angry world outside my door”: the 
2/44 ✅ Why anxiety is the theme for Mental Health Awareness Week 20
3/44 ✅ Financial distress and anxiety during the holidays
4/44 ✅ Anxiety
5/44 ✅ Depression
6/44 ✅ Postnatal depression
7/44 ✅ Anxiety: statistics
8/44 ✅ Depression: statistics
9/44 ✅ My struggle with speaking up about my depression
10/44 ✅ Loneliness and anxiety as a young mum
11/44 ✅ How to manage fear and anxiety
12/44 ✅ Living with Anxiety report
13/44 ✅ All About Depression booklet
14/44 ✅ Overcoming my fear and anxiety with a skydive
15/44 ✅ Binge eating, depression and obsession with fitness
16/44 ✅ Coping with feelings of anxiety
17/44 ✅ Anxiety and how to tackle it
18/44 ✅ What causes anxiety?
19/44 ✅ Who has anxiety in the UK?
20/44 ✅ What is anxiety?
21/44 ✅ Stories about anxiety
22/44 ✅ What does anxiety feel like?
23/44 ✅ Coping with anxiety
24/44 ✅ Seeking support for anxiety
25/44 ✅ Wh

In [52]:
# --- 7. Convert to DataFrame ---
# Name the columns explicitly so the frame (and the CSV written from it) keeps
# its schema even when the scrape collected nothing; for a non-empty list of
# article dicts the result is the same as letting pandas infer the columns.
df = pd.DataFrame(articles, columns=["URL", "Title", "Description"])

In [54]:
# --- 8. Add additional columns (empty placeholders for now) ---
# DataFrame.insert raises ValueError when the column already exists, so drop
# an "ID" left by a previous run first; the cell can then be re-run safely.
if "ID" in df.columns:
    df = df.drop(columns="ID")
df.insert(0, "ID", range(1, len(df) + 1))   # Auto ID, 1-based, first column

# Annotation columns, left blank here.
df["Tags"] = ""
df["Tone"] = ""
df["Audience"] = ""
# NOTE(review): "Example Source Name" and "Example Country" look like template
# placeholders — confirm the real values for this source before using the CSV.
df["Source"] = "Example Source Name"
df["Locality (yes or no)"] = ""  # Whether source is local (e.g., Malaysian)
df["Country"] = "Example Country"
df["Relevance"] = ""

In [56]:
# --- 9. Save to CSV for this source ---
# Keep the file name in one place so the confirmation message always names
# the file that was actually written.
output_csv = "MFH_articles_raw.csv"
df.to_csv(output_csv, index=False)
print(f"💾 Data saved to {output_csv}")

💾 Data saved to example_articles_raw.csv
