In [5]:
import pandas as pd
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [2]:

# Replace 'path_to_your_dataset.csv' with the actual path to your dataset
dataset_path = '../data/raw/Mental_health_dataset.csv'
df = pd.read_csv(dataset_path)

In [3]:
df.head()

Unnamed: 0,Timestamp,Content,Sentiment,Topics,Mental Health Indicator,Source Platform,Language
0,23-08-22 09:22,Feeling disconnected from everyone around me.,negative,anxiety,1,Facebook,English
1,23-01-10 03:05,Struggling to find motivation for even the sma...,negative,anxiety,1,Instagram,English
2,23-04-11 09:15,Feeling quite overwhelmed by everything. Need ...,negative,anxiety,1,Instagram,English
3,23-03-23 11:19,Had a great workout today! Feeling energized a...,positive,well-being,0,Reddit,English
4,23-09-26 19:29,Struggling to find motivation for even the sma...,negative,loneliness,1,Instagram,English


In [7]:
# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Define a cleaning function
def clean_text(text):
    # Remove URLs, special characters, and numbers
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\W+|\d+', ' ', text)
    
    # Convert text to lowercase
    text = text.lower()
    
    # Remove stopwords and apply lemmatization
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    # Join tokens back into a string
    cleaned_text = ' '.join(cleaned_tokens)
    return cleaned_text

# Apply the cleaning function to the Content column
df['Content'] = df['Content'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
df.head()

Unnamed: 0,Timestamp,Content,Sentiment,Topics,Mental Health Indicator,Source Platform,Language
0,23-08-22 09:22,feeling disconnected everyone around,negative,anxiety,1,Facebook,English
1,23-01-10 03:05,struggling find motivation even smallest task,negative,anxiety,1,Instagram,English
2,23-04-11 09:15,feeling quite overwhelmed everything need talk...,negative,anxiety,1,Instagram,English
3,23-03-23 11:19,great workout today feeling energized positive,positive,well-being,0,Reddit,English
4,23-09-26 19:29,struggling find motivation even smallest task,negative,loneliness,1,Instagram,English


In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Clean and preprocess text as before
# Assume 'df' is already loaded and 'Content' column is cleaned

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['Content'])
sequences = tokenizer.texts_to_sequences(df['Content'])
X_padded = pad_sequences(sequences, maxlen=100)  # Assume maxlen=100


In [11]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np

# Sentiment and Topics as one-hot encoded arrays
sentiment_encoder = LabelEncoder()
topics_encoder = LabelEncoder()

y_sentiment = sentiment_encoder.fit_transform(df['Sentiment'])
y_topics = topics_encoder.fit_transform(df['Topics'])

onehot_encoder = OneHotEncoder()
y_sentiment = onehot_encoder.fit_transform(y_sentiment.reshape(-1, 1)).toarray()
y_topics = onehot_encoder.fit_transform(y_topics.reshape(-1, 1)).toarray()


# Mental Health Indicator as a binary label
y_mh_indicator = np.array(df['Mental Health Indicator'])


In [14]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

# Input layer
input_layer = Input(shape=(100,))  # 100 is the maxlen of padded sequences

# Shared layers
embedding_layer = Embedding(input_dim=10000, output_dim=128)(input_layer)
lstm_layer = LSTM(64)(embedding_layer)

# Output layers
sentiment_output = Dense(y_sentiment.shape[1], activation='softmax', name='sentiment')(lstm_layer)
topics_output = Dense(y_topics.shape[1], activation='softmax', name='topics')(lstm_layer)
mh_indicator_output = Dense(1, activation='sigmoid', name='mh_indicator')(lstm_layer)

# Build the model
model = Model(inputs=input_layer, outputs=[sentiment_output, topics_output, mh_indicator_output])

# Compile the model
model.compile(optimizer='adam',
              loss={'sentiment': 'categorical_crossentropy',
                    'topics': 'categorical_crossentropy',
                    'mh_indicator': 'binary_crossentropy'},
              metrics={'sentiment': ['accuracy'],
                       'topics': ['accuracy'],
                       'mh_indicator': ['accuracy']})


In [15]:
model.fit(X_padded, {'sentiment': y_sentiment, 'topics': y_topics, 'mh_indicator': y_mh_indicator},
          batch_size=32, epochs=10, validation_split=0.2)


Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 49ms/step - loss: 1.3159 - mh_indicator_accuracy: 0.9673 - sentiment_accuracy: 0.9707 - topics_accuracy: 0.3260 - val_loss: 1.1019 - val_mh_indicator_accuracy: 1.0000 - val_sentiment_accuracy: 1.0000 - val_topics_accuracy: 0.3387
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 51ms/step - loss: 1.1058 - mh_indicator_accuracy: 1.0000 - sentiment_accuracy: 1.0000 - topics_accuracy: 0.3266 - val_loss: 1.1022 - val_mh_indicator_accuracy: 1.0000 - val_sentiment_accuracy: 1.0000 - val_topics_accuracy: 0.3335
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 49ms/step - loss: 1.1033 - mh_indicator_accuracy: 1.0000 - sentiment_accuracy: 1.0000 - topics_accuracy: 0.3390 - val_loss: 1.1021 - val_mh_indicator_accuracy: 1.0000 - val_sentiment_accuracy: 1.0000 - val_topics_accuracy: 0.3367
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

<keras.src.callbacks.history.History at 0x2316b7825c0>

In [16]:
# Assume X_test, y_test_sentiment, y_test_topics, and y_test_mh_indicator are prepared
model.evaluate(X_test, {'sentiment': y_test_sentiment, 'topics': y_test_topics, 'mh_indicator': y_test_mh_indicator})


NameError: name 'X_test' is not defined

In [19]:
def predict_new_input(model, tokenizer, new_input):
    # Tokenize and pad the new input
    seq = tokenizer.texts_to_sequences([new_input])
    padded = pad_sequences(seq, maxlen=100)
    
    # Predict
    prediction = model.predict(padded)
    sentiment_pred, topics_pred, mh_indicator_pred = prediction
    
    # Decode predictions
    sentiment_label = sentiment_encoder.inverse_transform([np.argmax(sentiment_pred)])
    topics_label = topics_encoder.inverse_transform([np.argmax(topics_pred)])
    mh_indicator_label = (mh_indicator_pred > 0.5).astype(int)
    
    return sentiment_label, topics_label, mh_indicator_label

# Example usage
# new_input = "I'm feeling quite happy today, but a bit anxious about tomorrow."
custom_texts = [
    "Just finished a great book on ancient history and I'm feeling inspired!",
    "Lately, I've been feeling overwhelmed with worry about things that are out of my control.",
    "Starting meditation has significantly improved my overall sense of well-being.",
    "The weather has been quite unpredictable this week, with rain and sunshine alternating.",
    "No matter what I do, there's a persistent feeling of sadness that I can't seem to shake off.",
    "After months of therapy, I'm finally starting to see improvements in how I feel about myself and my life.",
    "Recent studies suggest that spending time in nature can have a positive effect on mental health.",
    "Deadlines are approaching fast, and I'm starting to doubt if I can handle the pressure.",
    "I am so grateful for the support group I've found; it's comforting to know I'm not alone in this journey.",
    "I'm hopeful about the future but anxious about the changes it might bring to my personal life and mental health."
]
for new_input in custom_texts:
    print(predict_new_input(model, tokenizer, new_input))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
(array(['negative'], dtype=object), array(['depression'], dtype=object), array([[1]]))
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
(array(['negative'], dtype=object), array(['depression'], dtype=object), array([[1]]))
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
(array(['negative'], dtype=object), array(['depression'], dtype=object), array([[1]]))
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
(array(['negative'], dtype=object), array(['depression'], dtype=object), array([[1]]))
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
(array(['negative'], dtype=object), array(['depression'], dtype=object), array([[1]]))
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
(array(['negative'], dtype=object), array(['depression'], dtype=object), array([[1]]))
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m