In [1]:
#Environment Setup Step

In [2]:
!python --version

Python 3.12.7


In [3]:
#Install necessary packages
!pip install pandas scikit-learn nltk



In [4]:
#Install Flask for API
!pip install flask



In [5]:
#Import Libraries

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import nltk

print("All libraries imported successfully!")

All libraries imported successfully!


In [6]:
#Download NLTK resources to download English stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
#Check if everything is ready
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

STOP = set(stopwords.words('english'))
print(f"Loaded {len(STOP)} stopwords.")

Loaded 198 stopwords.


In [8]:
#Text Preprocessing Part

In [9]:
#Import Basic Setups for text cleaning 
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download stopwords (only first time)
nltk.download('stopwords')

STOP = set(stopwords.words('english'))
STEMMER = PorterStemmer()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
#Preprocessing function

def preprocess(text):
    """
    Clean and normalize text for TF-IDF.
    Steps: lowercase, remove punctuation, remove stopwords, stemming.
    """
    if not isinstance(text, str):
        return ""
    
    # lowercase
    text = text.lower()
    
    # keep only letters/numbers/spaces
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    
    # tokenize
    tokens = text.split()
    
    # remove stopwords
    tokens = [w for w in tokens if w not in STOP]
    
    # stemming
    stems = [STEMMER.stem(t) for t in tokens]
    
    return " ".join(stems)


In [11]:
#Sample Test
sample_sentences = [
    "This platform provides mental-health resources for Malaysian youth.",
    "Find quick breathing exercises and stress-relief activities!",
    "Our system does *not* offer diagnostic advice."
]

for s in sample_sentences:
    print("Original:", s)
    print("Processed:", preprocess(s))
    print()


Original: This platform provides mental-health resources for Malaysian youth.
Processed: platform provid mental health resourc malaysian youth

Original: Find quick breathing exercises and stress-relief activities!
Processed: find quick breath exercis stress relief activ

Original: Our system does *not* offer diagnostic advice.
Processed: system offer diagnost advic



In [12]:
#Import Data
import pandas as pd

# Path to your dataset
file_path = r"C:\Users\USER\Desktop\FYP\Dataset\Mental_Health_Resources.csv"

# Load dataset
df = pd.read_csv(file_path)

# Display first few rows
df.head()

Unnamed: 0,ID,Title,Description,URL,Format,Tags,Tone,Audience,Source,Language
0,1,How to Manage Fear and Anxiety,Comprehensive guide explaining the origins of ...,https://www.mentalhealth.org.uk/explore-mental...,Guide,anxiety|coping|fear|mental-health,supportive,all,Mental Health Foundation,English
1,2,Doomscrolling - Tips for Healthier News Consum...,Article examining the mental health effects of...,https://www.mentalhealth.org.uk/explore-mental...,Article,anxiety|digital-wellbeing|stress|media,informative,all,Mental Health Foundation,English
2,3,How to Sleep Better,Practical guide offering evidence-based tips f...,https://www.mentalhealth.org.uk/explore-mental...,Guide,sleep|insomnia|rest|wellness,calm,all,Mental Health Foundation,English
3,4,Looking After Your Mental Health: A Guide for ...,Youth-focused guidance to help understand ment...,https://www.mentalhealth.org.uk/explore-mental...,Guide,mental-health|youth|self-care|wellbeing,supportive,young_adults,Mental Health Foundation,English
4,5,Grounding Techniques for Anxiety,Article about evidence-based grounding techniq...,https://www.therapistaid.com/therapy-article/g...,Article,anxiety|grounding|coping|mindfulness,calm,all,Therapist Aid,English


In [13]:
df.columns

Index(['ID', 'Title', 'Description', 'URL', 'Format', 'Tags', 'Tone',
       'Audience', 'Source', 'Language'],
      dtype='object')

In [14]:
#Combine relevant columns and clean text

df['combined'] = (
    df['Title'].fillna('') + ' ' +
    df['Description'].fillna('') + ' ' +
    df['Tags'].fillna('') + ' ' +
    df['Format'].fillna('') + ' ' +
    df['Tone'].fillna('') + ' ' +
    df['Audience'].fillna('') + ' ' +
    df['Language'].fillna('')
)

# Apply your preprocessing function
df['clean'] = df['combined'].apply(preprocess)

# Check the result
df[['Title', 'clean']].head()

Unnamed: 0,Title,clean
0,How to Manage Fear and Anxiety,manag fear anxieti comprehens guid explain ori...
1,Doomscrolling - Tips for Healthier News Consum...,doomscrol tip healthier news consumpt articl e...
2,How to Sleep Better,sleep better practic guid offer evid base tip ...
3,Looking After Your Mental Health: A Guide for ...,look mental health guid young peopl youth focu...
4,Grounding Techniques for Anxiety,ground techniqu anxieti articl evid base groun...


In [15]:
#Save the cleaned Dataset so won’t have to clean again later
df.to_csv(r"C:\Users\USER\Desktop\FYP\Dataset\Mental_Health_Resources_cleaned.csv", index=False)

In [16]:
#TF-IDF Vectorization + Similarity Setup

In [17]:
#Import required modules

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [18]:
#Create and fit the TF-IDF vectorizer

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(
    max_features=5000,       # limit vocab size to reduce noise
    ngram_range=(1, 2)       # include unigrams + bigrams
)

# Fit and transform the cleaned text column
tfidf_matrix = vectorizer.fit_transform(df['clean'])

print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (740, 5000)


In [19]:
#meaning 740 resources × 5000 features

In [20]:
#Define the recommendation function
def recommend(query, top_n=5):
    """
    Recommend the most relevant mental-health resources based on user input.
    """
    # Clean and vectorize user query using same preprocessing & vectorizer
    q_clean = preprocess(query)
    q_vec = vectorizer.transform([q_clean])

    # Compute cosine similarity between query and all resources
    cosine_sim = linear_kernel(q_vec, tfidf_matrix).flatten()

    # Get top N indices sorted by similarity
    top_indices = cosine_sim.argsort()[::-1][:top_n]

    # Prepare results
    results = []
    for idx in top_indices:
        results.append({
            'Title': df.iloc[idx]['Title'],
            'Description': df.iloc[idx]['Description'],
            'URL': df.iloc[idx]['URL'],
            'Score': round(float(cosine_sim[idx]), 4)
        })
    return results


In [21]:
#Test Recommender With simple Query
query = "stress relief breathing exercises for teenagers"
recommendations = recommend(query, top_n=5)

for r in recommendations:
    print(f"Score: {r['Score']:.3f} | {r['Title']}")
    print(f"→ {r['Description'][:120]}...")
    print(f"URL: {r['URL']}")
    print("-" * 80)

Score: 0.418 | 4-7-8 Breathing Technique Video
→ Quick tutorial on the 4-7-8 breathing technique for immediate anxiety and stress relief....
URL: https://www.youtube.com/watch?v=gz4G31LGyog
--------------------------------------------------------------------------------
Score: 0.377 | Box Breathing Technique Video
→ Short tutorial demonstrating the box breathing method for immediate anxiety and stress relief....
URL: https://www.youtube.com/watch?v=tEmt1Znux58
--------------------------------------------------------------------------------
Score: 0.377 | Box Breathing Technique Video
→ Short tutorial demonstrating the box breathing method for immediate anxiety and stress relief....
URL: https://www.youtube.com/watch?v=tEmt1Znux58
--------------------------------------------------------------------------------
Score: 0.371 | Mindful Breathing Techniques Video
→ Video demonstrating various breathing techniques for immediate anxiety and stress relief....
URL: https://www.youtube.com/watch

In [22]:
#The system returned five recommendations, ranked by a numerical Score, 
#indicating their relevance to the query. Higher scores mean greater relevance.

"""
Observations
Focus on Videos: The top four results are YouTube videos (indicated by the URL structure), 
suggesting the recommender might be heavily weighted toward video content or that videos are the most relevant result type for this query.

Breathing Techniques: The results cover popular stress-relief methods like the 4-7-8 and Box Breathing techniques.

Duplicate Entry: The second and third entries are identical: same score, title, description, and URL. 
This is a common occurrence in recommendation systems, especially if the underlying data source contains 
identical or highly similar content items with the same ranking.

Government Source: The last result links to a government website (moh.gov.my/), which is likely the Malaysian Ministry of Health,
providing a non-video source for anxiety relief techniques.
"""


'\nObservations\nFocus on Videos: The top four results are YouTube videos (indicated by the URL structure), \nsuggesting the recommender might be heavily weighted toward video content or that videos are the most relevant result type for this query.\n\nBreathing Techniques: The results cover popular stress-relief methods like the 4-7-8 and Box Breathing techniques.\n\nDuplicate Entry: The second and third entries are identical: same score, title, description, and URL. \nThis is a common occurrence in recommendation systems, especially if the underlying data source contains \nidentical or highly similar content items with the same ranking.\n\nGovernment Source: The last result links to a government website (moh.gov.my/), which is likely the Malaysian Ministry of Health,\nproviding a non-video source for anxiety relief techniques.\n'

In [23]:
#Save trained objects (optional, for later API use)

import joblib

joblib.dump(vectorizer, r"C:\Users\USER\Desktop\FYP\Dataset\tfidf_vectorizer.pkl")
joblib.dump(df, r"C:\Users\USER\Desktop\FYP\Dataset\cleaned_dataset.pkl")


['C:\\Users\\USER\\Desktop\\FYP\\Dataset\\cleaned_dataset.pkl']

In [24]:
#Flask app or future scripts can load them directly later.