In [32]:
import pandas as pd

# Load the scraped articles into a DataFrame and preview the first rows.
articles_csv = "mental_health_articles.csv"
df = pd.read_csv(articles_csv)
df.head()


Unnamed: 0,url,title,body,lang
0,https://www.mentalhealth.org.uk/explore-mental...,Tackling digital exclusion in older people,This yearâ€™s theme set by the UN is â€˜Digital Eq...,en
1,https://www.mentalhealth.org.uk/explore-mental...,Mental health advice for older people during t...,"This content mentions loneliness or isolation,...",en
2,https://www.mentalhealth.org.uk/explore-mental...,A focus on the mental health of minority men,This content mentions suicide or suicidal thou...,en
3,https://www.mentalhealth.org.uk/explore-mental...,Mental health research - what's in it for us?,This year marks70 yearssince the creation of t...,en
4,https://www.mentalhealth.org.uk/explore-mental...,The NHS Long Term Plan: progress and a way for...,NHS England has published the newNHS Long Term...,en


In [33]:
# Rename the raw article text column. rename() returns a new frame and ignores
# a missing "body" key by default, so this cell can be re-run safely.
df = df.rename(columns={"body": "description"})

# Keep only the text up to the first "." as a short description.
# Missing bodies stay missing instead of becoming the literal string "nan.",
# so a later dropna(subset=["description"]) can still remove them.
# NOTE: splitting on every "." also cuts at abbreviations, decimals and URLs.
df["description"] = df["description"].map(
    lambda body: body if pd.isna(body) else str(body).split(".")[0] + "."
)
df.head()


Unnamed: 0,url,title,description,lang
0,https://www.mentalhealth.org.uk/explore-mental...,Tackling digital exclusion in older people,This yearâ€™s theme set by the UN is â€˜Digital Eq...,en
1,https://www.mentalhealth.org.uk/explore-mental...,Mental health advice for older people during t...,"This content mentions loneliness or isolation,...",en
2,https://www.mentalhealth.org.uk/explore-mental...,A focus on the mental health of minority men,This content mentions suicide or suicidal thou...,en
3,https://www.mentalhealth.org.uk/explore-mental...,Mental health research - what's in it for us?,This year marks70 yearssince the creation of t...,en
4,https://www.mentalhealth.org.uk/explore-mental...,The NHS Long Term Plan: progress and a way for...,NHS England has published the newNHS Long Term...,en


In [1]:
# Create empty placeholder columns for the enrichment fields.
for placeholder in ("tags", "tone", "audience"):
    df[placeholder] = ""

NameError: name 'df' is not defined

In [35]:
#Auto-suggest tags (keyword-based)

import re #regular expression library. It allows you to search for patterns (words, phrases, etc.) inside a string.

# (tag, pattern) pairs, checked in this order. Every pattern starts with \b so
# a keyword only matches at the start of a word: "teen" matches "teenagers"
# but not "thirteen", and "parent" matches "parents" but not "apparent".
_TAG_PATTERNS = (
    ("anxiety", r"\banxiety"),
    ("depression", r"\bdepression"),
    ("stress", r"\bstress"),
    ("coping", r"\bcoping"),
    ("youth", r"\b(?:youth|teen|student)"),
    ("family", r"\b(?:family|parent)"),
    ("mindfulness", r"\b(?:mindfulness|meditation)"),
    ("fear", r"\b(?:fear|worry)"),
    ("mental-health", r"\bmental health"),
)


def extract_tags(text):
    """Return the keyword tags that apply to one article text.

    Matching is case-insensitive and anchored at the start of a word, so
    inflected forms ("students", "worrying") still match while keywords that
    merely appear inside another word ("thirteen", "apparent") do not.

    Args:
        text: The article text. Non-string values (for example NaN from a
            missing description) are treated as having no keywords.

    Returns:
        The matching tags joined with "|" in the order of _TAG_PATTERNS, or
        "general" when no keyword matches.
    """
    if not isinstance(text, str):
        return "general"
    lowered = text.lower()
    tags = [tag for tag, pattern in _TAG_PATTERNS if re.search(pattern, lowered)]
    return "|".join(tags) if tags else "general"

# Tag every article from its description and preview the first ten results.
df["tags"] = df["description"].map(extract_tags)
df[["title", "tags"]].head(10)

Unnamed: 0,title,tags
0,Tackling digital exclusion in older people,general
1,Mental health advice for older people during t...,general
2,A focus on the mental health of minority men,general
3,Mental health research - what's in it for us?,mental-health
4,The NHS Long Term Plan: progress and a way for...,general
5,70 years on: do we understand prevention?,mental-health
6,Reclaiming Our Heritage,mental-health
7,"Refugees: behind every statistic, there is a p...",general
8,Excellence in Youth Work: See Me Youth Champions,youth|mental-health
9,Mental health: a national asset for Scotland,general


In [36]:
#Determine the tone of each article using sentiment model

In [37]:
!pip install transformers torch



In [38]:
# Label each description with the model's top emotion (this model emits: anger, disgust, fear, joy, neutral, sadness, surprise)
from transformers import pipeline

# Build the emotion classifier once; ai_tone() reuses it for every description.
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")


def ai_tone(text):
    """Return the top emotion label the classifier predicts for `text`.

    The tokenizer truncates the input to 512 tokens. The earlier `text[:512]`
    slice cut at 512 characters instead, which discards text the model could
    have used and is not a token limit at all.

    Args:
        text: The text to classify.

    Returns:
        The "label" field of the first prediction returned by the pipeline.
    """
    result = classifier(text, truncation=True, max_length=512)[0]
    return result["label"]


df["tone"] = df["description"].apply(ai_tone)

Device set to use cpu
  return forward_call(*args, **kwargs)


In [39]:
# Map raw emotion labels to the tone vocabulary used in the dataset.
# Labels without an entry (and missing values) become "neutral".
# NOTE(review): the classifier loaded earlier emits anger, disgust, fear, joy,
# neutral, sadness and surprise, so "optimism" and "love" never occur and
# "disgust"/"surprise" currently fall through to "neutral". Confirm intent.
tone_map = {
    "joy": "supportive",
    "sadness": "serious",
    "optimism": "encouraging",
    "fear": "serious",
    "love": "supportive",
    "anger": "serious"
}

# Values that are already tones pass through unchanged. Without this, running
# the cell a second time would map every row to "neutral", because the mapped
# tones are not keys of tone_map.
final_tones = set(tone_map.values()) | {"neutral"}
df["tone"] = df["tone"].map(
    lambda label: label if label in final_tones else tone_map.get(label, "neutral")
)


In [40]:
#Add audience column

In [41]:
def detect_audience(text):
    """Guess the target audience of one article text from keywords.

    Keywords are matched case-insensitively at the start of a word, so
    "teenagers" and "parents" match while "thirteen" and "apparent" do not.
    Audience groups are checked in order and the first match wins.

    Args:
        text: The article text. Non-string values (for example NaN from a
            missing description) are treated as having no keywords.

    Returns:
        "youth", "parents", "working-adults", or "general-public" when no
        keyword matches.
    """
    if not isinstance(text, str):
        return "general-public"
    lowered = text.lower()
    audience_patterns = (
        ("youth", r"\b(?:teen|youth|student|school|university)"),
        ("parents", r"\b(?:parent|family)"),
        ("working-adults", r"\b(?:employee|workplace|manager)"),
    )
    for audience, pattern in audience_patterns:
        if re.search(pattern, lowered):
            return audience
    return "general-public"

# Assign an audience to every article and preview the first ten results.
df["audience"] = df["description"].map(detect_audience)
df[["title", "audience"]].head(10)


Unnamed: 0,title,audience
0,Tackling digital exclusion in older people,general-public
1,Mental health advice for older people during t...,general-public
2,A focus on the mental health of minority men,general-public
3,Mental health research - what's in it for us?,general-public
4,The NHS Long Term Plan: progress and a way for...,general-public
5,70 years on: do we understand prevention?,general-public
6,Reclaiming Our Heritage,general-public
7,"Refugees: behind every statistic, there is a p...",general-public
8,Excellence in Youth Work: See Me Youth Champions,youth
9,Mental health: a national asset for Scotland,general-public


In [58]:
import pandas as pd

# Record where the articles came from: every row gets the same source label.
source_name = "Mental Health Foundation"
df["source"] = source_name

# Preview the frame to confirm the new column is present.
df.head()

Unnamed: 0,url,title,description,lang,tags,tone,audience,source
0,https://www.mentalhealth.org.uk/explore-mental...,Tackling digital exclusion in older people,This yearâ€™s theme set by the UN is â€˜Digital Eq...,en,general,neutral,general-public,Mental Health Foundation
1,https://www.mentalhealth.org.uk/explore-mental...,Mental health advice for older people during t...,"This content mentions loneliness or isolation,...",en,general,neutral,general-public,Mental Health Foundation
2,https://www.mentalhealth.org.uk/explore-mental...,A focus on the mental health of minority men,This content mentions suicide or suicidal thou...,en,general,neutral,general-public,Mental Health Foundation
3,https://www.mentalhealth.org.uk/explore-mental...,Mental health research - what's in it for us?,This year marks70 yearssince the creation of t...,en,mental-health,supportive,general-public,Mental Health Foundation
4,https://www.mentalhealth.org.uk/explore-mental...,The NHS Long Term Plan: progress and a way for...,NHS England has published the newNHS Long Term...,en,general,neutral,general-public,Mental Health Foundation


In [62]:
#save

# Write the enriched dataset to disk without the index column.
enriched_csv = "mental_health_articles_enriched.csv"
df.to_csv(enriched_csv, index=False)
print("âœ… Saved updated dataset with new columns!")

âœ… Saved updated dataset with new columns!


In [64]:
#Data Refinement 
#only want the top ~200 highest-quality ones that are most relevant to depression and anxiety.

In [66]:
import pandas as pd

# Reload the enriched dataset that this notebook writes in its save step.
# The previous path ("Mental_Health_Foundation.csv") is not produced by any
# cell, so a fresh Restart & Run All depended on a manually renamed file.
# NOTE(review): if that file was curated by hand, restore the old path.
df = pd.read_csv("mental_health_articles_enriched.csv")
print(df.shape)
df.head()

(543, 8)


Unnamed: 0,url,title,description,lang,tags,tone,audience,source
0,https://www.mentalhealth.org.uk/explore-mental...,Tackling digital exclusion in older people,This yearâ€™s theme set by the UN is â€˜Digital Eq...,en,general,neutral,general-public,Mental Health Foundation
1,https://www.mentalhealth.org.uk/explore-mental...,Mental health advice for older people during t...,"This content mentions loneliness or isolation,...",en,general,neutral,general-public,Mental Health Foundation
2,https://www.mentalhealth.org.uk/explore-mental...,A focus on the mental health of minority men,This content mentions suicide or suicidal thou...,en,general,neutral,general-public,Mental Health Foundation
3,https://www.mentalhealth.org.uk/explore-mental...,Mental health research - what's in it for us?,This year marks70 yearssince the creation of t...,en,mental-health,supportive,general-public,Mental Health Foundation
4,https://www.mentalhealth.org.uk/explore-mental...,The NHS Long Term Plan: progress and a way for...,NHS England has published the newNHS Long Term...,en,general,neutral,general-public,Mental Health Foundation


In [68]:
#Basic cleaning

In [74]:
# Clean in three steps: keep the first row for each URL, drop rows whose
# description is missing, then keep only descriptions with more than
# 10 whitespace-separated words.
df = (
    df.drop_duplicates(subset=["url"])
      .dropna(subset=["description"])
      .loc[lambda frame: frame["description"].str.split().str.len() > 10]
)

print("âœ… After cleaning:", len(df), "articles remain")


âœ… After cleaning: 56 articles remain


In [76]:
#Expand Keyword list to catch more articles that are related to those core topics

In [86]:
#Calculate relevancy
def relevance_score(text):
    """Score how relevant a text is to anxiety and depression topics.

    Each keyword occurrence adds its weight to the score:
      * 3 points: "anxiety", "depression" (whole words)
      * 2 points: "stress", "mental health" (whole words)
      * 1 point:  "coping" (whole word)
      * 1 point:  any word starting with therapy, counsel, support, mood,
                  wellbeing, well-being, mindfulness, panic, fear, self-care,
                  sadness or worry (so "counselling" and "worrying" count)

    The broad keyword groups are wrapped in a non-capturing group. In the
    earlier pattern `\\btherapy|counsel|...|mindfulness\\b` the `\\b` anchors
    applied only to the first and last alternatives, so some keywords matched
    inside other words while others missed their inflected forms.

    Args:
        text: The text to score. It is converted with str(), so missing
            values score 0.

    Returns:
        The integer relevance score (0 when no keyword is found).
    """
    lowered = str(text).lower()
    weighted_patterns = (
        (r"\banxiety\b", 3),
        (r"\bdepression\b", 3),
        (r"\bstress\b", 2),
        (r"\bmental health\b", 2),
        (r"\bcoping\b", 1),
        (r"\b(?:therapy|counsel|support|mood|wellbeing|well-being|mindfulness)\w*", 1),
        (r"\b(?:panic|fear|self-care|sadness|worry)\w*", 1),
    )
    return sum(
        len(re.findall(pattern, lowered)) * weight
        for pattern, weight in weighted_patterns
    )

In [80]:
#Allow articles that mention these topics indirectly

In [82]:
# Score every description, then order the articles from most to least relevant.
# assign() and sort_values() build a new frame instead of mutating `df` in
# place; assigning a column to a frame that came from boolean filtering can
# raise SettingWithCopyWarning. kind="stable" keeps tied scores in their
# existing order so the ranking is reproducible.
df = (
    df.assign(relevance=df["description"].apply(relevance_score))
      .sort_values("relevance", ascending=False, kind="stable")
)

# Keep top 200 even if score is low
df_top200 = df.head(200)

In [88]:
# Drop articles whose relevance score is zero.
has_signal = df["relevance"] > 0
df = df[has_signal]

# Report how many articles are left and how the scores are distributed.
print("âœ… After removing relevance = 0:", len(df), "articles remain")
df["relevance"].value_counts().sort_index()

âœ… After removing relevance = 0: 42 articles remain


relevance
1     3
2    16
3     6
4     7
5     3
6     6
7     1
Name: count, dtype: int64

In [90]:
#Save

# Write the relevance-filtered dataset to disk without the index column.
relevant_csv = "mental_health_articles_relevant.csv"
df.to_csv(relevant_csv, index=False)
print("ðŸ’¾ Saved as 'mental_health_articles_relevant.csv'")

ðŸ’¾ Saved as 'mental_health_articles_relevant.csv'


In [92]:
# Number the articles 1..N in their current row order, as the first column.
# DataFrame.insert raises ValueError when the column already exists, so drop
# any previous "ID" column first to keep this cell safe to re-run.
if "ID" in df.columns:
    df = df.drop(columns="ID")
df.insert(0, "ID", range(1, len(df) + 1))

# Check the result
df.head()

Unnamed: 0,ID,url,title,description,lang,tags,tone,audience,source,relevance
514,1,https://www.mentalhealth.org.uk/explore-mental...,Factors that affect mental health,"At the Mental Health Foundation, we know that ...",en,mental-health,neutral,general-public,Mental Health Foundation,7
190,2,https://www.mentalhealth.org.uk/explore-mental...,Scottish Mental Health Arts Festival 2025,"Location:United Kingdom, England, Northern Ire...",en,mental-health,supportive,general-public,Mental Health Foundation,6
37,3,https://www.mentalhealth.org.uk/explore-mental...,"Pride, being gay and the effects on my mental ...",This content mentions suicide or suicidal thou...,en,anxiety|depression,serious,general-public,Mental Health Foundation,6
300,4,https://www.mentalhealth.org.uk/explore-mental...,Mental health statistics,Would you like to know how rates of mental ill...,en,mental-health,neutral,general-public,Mental Health Foundation,6
141,5,https://www.mentalhealth.org.uk/explore-mental...,Racism and mental health,This content mentions suicide or suicidal thou...,en,anxiety|depression,neutral,general-public,Mental Health Foundation,6


In [94]:
# Write the final dataset, including the ID column, without the index column.
with_id_csv = "mental_health_articles_relevant_with_id.csv"
df.to_csv(with_id_csv, index=False)
print("âœ… 'ID' column added and dataset saved!")


âœ… 'ID' column added and dataset saved!
