In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
import emoji

# Download NLTK resources
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('omw-1.4')

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_pe

In [2]:
# Load and preprocess data
dataset_dir = os.path.join('..', 'Dataset')
data_path = os.path.join(dataset_dir, 'Suicide_Detection.csv')

data = pd.read_csv(data_path)
data.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [3]:
data['class'].replace({'suicide':1 , 'non-suicide':0} , inplace = True)
data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['class'].replace({'suicide':1 , 'non-suicide':0} , inplace = True)
  data['class'].replace({'suicide':1 , 'non-suicide':0} , inplace = True)


Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,1
1,3,Am I weird I don't get affected by compliments...,0
2,4,Finally 2020 is almost over... So I can never ...,0
3,8,i need helpjust help me im crying so hard,1
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",1


In [4]:
texts = data['text'].values
labels = data['class'].values

In [5]:
def process_text(text):
    # Convert emojis to text
    text = emoji.demojize(text, delimiters=(" ", " "))

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize and remove stop words, apply lemmatization
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens if word.lower() not in stop_words]

    # Join tokens back into a string
    processed_text = " ".join(tokens)

    return processed_text

# def process_text_data(text_data):
#     # Check if text_data is a numpy array
#     if isinstance(text_data, np.ndarray):
#         processed_data = np.array([process_text(text) for text in text_data])
#     else:
#         raise TypeError("Input should be a numpy.ndarray")
#     return processed_data


In [6]:
text_data = texts.tolist()

# processed_data = process_text_data(text_data)
processed_data = [process_text(text) for text in text_data]
# for original, processed in zip(text_data[:2], processed_data[:2]):
#     print("Original:", original)
#     print("Processed:", processed)
#     print()

In [7]:
text_data[-1], processed_data[-1]

("I still haven't beaten the first boss in Hollow Knight. I've only fought it a few times and I always die really early in the fight. I'm terrible at this game y'all. :(",
 'still havent beaten first bos hollow knight ive fought time always die really early fight im terrible game yall')

In [8]:
data['process_text'] = processed_data
data.head()

Unnamed: 0.1,Unnamed: 0,text,class,process_text
0,2,Ex Wife Threatening SuicideRecently I left my ...,1,ex wife threatening suiciderecently left wife ...
1,3,Am I weird I don't get affected by compliments...,0,weird dont get affected compliment coming some...
2,4,Finally 2020 is almost over... So I can never ...,0,finally 2020 almost never hear 2020 bad year e...
3,8,i need helpjust help me im crying so hard,1,need helpjust help im cry hard
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",1,im losthello name adam 16 ive struggling year ...


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Unnamed: 0    232074 non-null  int64 
 1   text          232074 non-null  object
 2   class         232074 non-null  int64 
 3   process_text  232074 non-null  object
dtypes: int64(2), object(2)
memory usage: 7.1+ MB


In [10]:
process_data_path = os.path.join(dataset_dir, 'Process_Suicide_Detection.csv')
data.to_csv(process_data_path, index=False)

# Load process_data stage 1

In [11]:
data = pd.read_csv(process_data_path)
data.head()

Unnamed: 0.1,Unnamed: 0,text,class,process_text
0,2,Ex Wife Threatening SuicideRecently I left my ...,1,ex wife threatening suiciderecently left wife ...
1,3,Am I weird I don't get affected by compliments...,0,weird dont get affected compliment coming some...
2,4,Finally 2020 is almost over... So I can never ...,0,finally 2020 almost never hear 2020 bad year e...
3,8,i need helpjust help me im crying so hard,1,need helpjust help im cry hard
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",1,im losthello name adam 16 ive struggling year ...


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Unnamed: 0    232074 non-null  int64 
 1   text          232074 non-null  object
 2   class         232074 non-null  int64 
 3   process_text  232050 non-null  object
dtypes: int64(2), object(2)
memory usage: 7.1+ MB


In [13]:
# show null values
data[data.isna().any(axis=1)]

Unnamed: 0.1,Unnamed: 0,text,class,process_text
408,608,︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎\n︎\...,0,
9592,14410,͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏͏͏͏ ͏ ͏ ͏ ͏...,0,
20544,30931,... --- ... / .... . .-.. .--. / .--. .-.. . ....,1,
21188,31886,:/:/,1,
26469,39747,I DID IT \n\n\n\n\nNot,0,
35823,53714,‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ...,0,
55136,82678,this is me when the when the,0,
77776,116797,︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ︎ ...,0,
81678,122656,‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ‍ ...,0,
102482,153973,:(again and again,1,


In [15]:
# drop na value
data.dropna().reset_index(drop=True, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 232050 entries, 0 to 232073
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Unnamed: 0    232050 non-null  int64 
 1   text          232050 non-null  object
 2   class         232050 non-null  int64 
 3   process_text  232050 non-null  object
dtypes: int64(2), object(2)
memory usage: 8.9+ MB


In [16]:
# save stage 1
process_data_path = os.path.join(dataset_dir, 'Process_Stage1_Suicide_Detection.csv')
data.to_csv(process_data_path, index=False)

# EDA word cloud

# Load process_data stage 2

In [17]:
data = pd.read_csv(process_data_path)
data.head()

Unnamed: 0.1,Unnamed: 0,text,class,process_text
0,2,Ex Wife Threatening SuicideRecently I left my ...,1,ex wife threatening suiciderecently left wife ...
1,3,Am I weird I don't get affected by compliments...,0,weird dont get affected compliment coming some...
2,4,Finally 2020 is almost over... So I can never ...,0,finally 2020 almost never hear 2020 bad year e...
3,8,i need helpjust help me im crying so hard,1,need helpjust help im cry hard
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",1,im losthello name adam 16 ive struggling year ...


In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232050 entries, 0 to 232049
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Unnamed: 0    232050 non-null  int64 
 1   text          232050 non-null  object
 2   class         232050 non-null  int64 
 3   process_text  232050 non-null  object
dtypes: int64(2), object(2)
memory usage: 7.1+ MB


In [28]:
%%time
# TF-IDF
# tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_vectorizer = TfidfVectorizer(max_features=100)
tfidf_features = tfidf_vectorizer.fit_transform(data['process_text'])
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df['class'] = data['class']

tfidf_df.head()

CPU times: total: 11.8 s
Wall time: 13.4 s


Unnamed: 0,also,always,anymore,anyone,anything,around,back,bad,better,cant,...,told,tried,try,want,way,well,work,would,year,class
0,0.0,0.0,0.0,0.0,0.0,0.0,0.36768,0.0,0.0,0.0,...,0.0,0.0,0.0,0.126111,0.174135,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.484986,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.380745,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.064171,0.064511,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.164944,1


In [29]:
tfidf_features_data_path = os.path.join(dataset_dir, 'tfidf_features_50.csv')
tfidf_df.to_csv(tfidf_features_data_path, index=False)

In [24]:
# %%time
# # POS Tagging
# pos_tagged_data = [pos_tag(word_tokenize(text)) for text in data['process_text']]
# pos_tagged_df = pd.DataFrame(pos_tagged_data, columns=["Word", "POS"])

# pos_tagged_df.head()

# MemoryError: Unable to allocate 31.4 GiB for an array with shape (232050, 18146) and data type object

In [30]:
%%time
# Bag of H-grams (unigrams, bigrams, trigrams)
count_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=100)
ngram_features = count_vectorizer.fit_transform(data['process_text'])
ngram_df = pd.DataFrame(ngram_features.toarray(), columns=count_vectorizer.get_feature_names_out())
ngram_df['class'] = data['class']

ngram_df.head()

CPU times: total: 2min 15s
Wall time: 2min 24s


Unnamed: 0,also,always,anymore,anyone,anything,around,back,bad,better,cant,...,time,told,tried,try,want,way,work,would,year,class
0,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,1,1,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,1,1,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,3,1


In [31]:
ngram_df_features_data_path = os.path.join(dataset_dir, 'ngram_df_features_50.csv')
ngram_df.to_csv(ngram_df_features_data_path, index=False)

In [32]:
%%time
# Latent Dirichlet Allocation (LDA)
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda_features = lda.fit_transform(ngram_features)
lda_df = pd.DataFrame(lda_features, columns=[f"Topic_{i+1}" for i in range(lda.n_components)])
lda_df['class'] = data['class']

lda_df.head()

CPU times: total: 11min 11s
Wall time: 11min 11s


Unnamed: 0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,class
0,0.008016,0.289972,0.008001,0.008242,0.68577,1
1,0.025,0.898189,0.025003,0.026267,0.025541,0
2,0.033333,0.034386,0.033334,0.237204,0.661744,0
3,0.050321,0.051255,0.05001,0.796905,0.051509,1
4,0.002778,0.26927,0.002778,0.002833,0.722341,1


In [33]:
lda_df_features_data_path = os.path.join(dataset_dir, 'lda_df_features_50.csv')
lda_df.to_csv(lda_df_features_data_path, index=False)

# Combine all features

In [38]:
# load all data
tfidf_df = pd.read_csv(tfidf_features_data_path)
tfidf_df = tfidf_df.drop(['class'], axis = 1)

ngram_df = pd.read_csv(ngram_df_features_data_path)
ngram_df = ngram_df.drop(['class'], axis = 1)

lda_df = pd.read_csv(lda_df_features_data_path)
# lda_df = lda_df.drop(['class'], axis = 1)

combined_df = pd.concat([tfidf_df, ngram_df, lda_df], axis=1)
combined_df

Unnamed: 0,also,always,anymore,anyone,anything,around,back,bad,better,cant,...,way,work,would,year,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,class
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.367680,0.000000,0.000000,0.000000,...,1,0,0,0,0.008016,0.289972,0.008001,0.008242,0.685770,1
1,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0.025000,0.898189,0.025003,0.026267,0.025541,0
2,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.484986,0.000000,0.000000,...,0,0,0,1,0.033333,0.034386,0.033334,0.237204,0.661744,0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0.050321,0.051255,0.050010,0.796905,0.051509,1
4,0.0,0.000000,0.064171,0.064511,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,3,0.002778,0.269270,0.002778,0.002833,0.722341,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232045,0.0,0.000000,0.000000,0.000000,0.499350,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0.028572,0.029708,0.028575,0.603239,0.309907,0
232046,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0.040000,0.838003,0.040002,0.040844,0.041150,0
232047,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0.066667,0.731766,0.066676,0.067825,0.067066,0
232048,0.0,0.000000,0.000000,0.000000,0.081052,0.186823,0.084033,0.000000,0.084835,0.065762,...,0,0,4,0,0.002667,0.002757,0.002667,0.329361,0.662549,1


In [44]:
combined_df_features_data_path = os.path.join(dataset_dir, 'combined_df.csv')
combined_df.to_csv(combined_df_features_data_path, index=False)

# Train and evaluate the combined features using a Distributed CNN-BiLSTM with a Hybrid Attention Module

1. Embedding Layer: For embedding each word in a dense vector space.
2. Convolutional Layer: For extracting local features using 1D convolutions.
3. BiLSTM Layer: For capturing contextual information from both past and future.
4. Hybrid Attention Module: For allowing the model to focus on important parts of the sequence dynamically.
5. Dense Layer: For classification.

In [41]:
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Attention, Flatten, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


### build_cnn_bilstm_attention_model

In [42]:
def build_cnn_bilstm_attention_model(input_shape):
    """
    Builds a CNN-BiLSTM model with a Hybrid Attention module for sequence classification.
    
    Parameters:
    input_shape (tuple): Shape of the input data (sequence length, feature size).

    Returns:
    model: Compiled Keras model ready for training.
    """
    # Input Layer
    inputs = Input(shape=input_shape)

    # Convolutional Layer for local feature extraction
    x = Conv1D(128, kernel_size=3, activation='relu', padding='same')(inputs)
    x = MaxPooling1D(pool_size=2)(x)  # Reduces the dimensionality

    # BiLSTM Layer for capturing contextual dependencies from both directions
    x = Bidirectional(LSTM(64, return_sequences=True))(x)

    # Hybrid Attention Module: Attention layer that computes importance scores
    attention_data = Attention()([x, x])
    x = Concatenate()([x, attention_data])  # Concatenates LSTM output with attention output

    # Flatten and fully connected layer for classification
    x = Flatten()(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)  # Dropout layer to prevent overfitting
    outputs = Dense(1, activation='sigmoid')(x)  # Sigmoid for binary classification

    # Compile model with Adam optimizer and binary crossentropy loss
    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    return model

### train_and_evaluate_model

In [43]:
def train_and_evaluate_model(data, labels):
    """
    Splits data into training and test sets, trains the CNN-BiLSTM model with attention,
    and evaluates it on the test set.

    Parameters:
    data (DataFrame or ndarray): Feature matrix.
    labels (ndarray): Binary target labels.

    Returns:
    model: Trained Keras model.
    metrics: Dictionary containing accuracy and classification report.
    """
    # Split data into training and testing sets (80-20 split)
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42, stratify=labels)

    # Reshape data to 3D format (samples, sequence length, features)
    # CNN and LSTM layers expect input with shape (batch_size, sequence_length, num_features)
    X_train = np.expand_dims(X_train, axis=2)
    X_test = np.expand_dims(X_test, axis=2)

    # Build the model
    model = build_cnn_bilstm_attention_model(X_train.shape[1:])

    # Train the model with early stopping to prevent overfitting
    model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1, verbose=1)

    # Predictions and evaluation
    y_pred = (model.predict(X_test) > 0.5).astype("int32")  # Binarize predictions
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # Metrics dictionary for easy access to accuracy and classification report
    metrics = {
        "accuracy": accuracy,
        "classification_report": report
    }

    return model, metrics

In [45]:
# Assuming `combined_features` is your feature DataFrame and `labels` is your target labels array
combined_df = pd.read_csv(combined_df_features_data_path)
data = combined_df.drop(['class'], axis = 1) # combined_features.values  # Convert DataFrame to ndarray for processing
labels = combined_df['class'] # Replace with actual labels array

model, metrics = train_and_evaluate_model(data, labels)
print("Model Accuracy:", metrics["accuracy"])
print("\nClassification Report:\n", metrics["classification_report"])

Epoch 1/10
[1m5222/5222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m726s[0m 138ms/step - accuracy: 0.8467 - loss: 0.3679 - val_accuracy: 0.8742 - val_loss: 0.3124
Epoch 2/10
[1m5222/5222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m658s[0m 126ms/step - accuracy: 0.8708 - loss: 0.3232 - val_accuracy: 0.8731 - val_loss: 0.3107
Epoch 3/10
[1m5222/5222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m569s[0m 109ms/step - accuracy: 0.8723 - loss: 0.3191 - val_accuracy: 0.8718 - val_loss: 0.3152
Epoch 4/10
[1m5222/5222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m517s[0m 99ms/step - accuracy: 0.8735 - loss: 0.3174 - val_accuracy: 0.8756 - val_loss: 0.3076
Epoch 5/10
[1m5222/5222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m603s[0m 115ms/step - accuracy: 0.8759 - loss: 0.3124 - val_accuracy: 0.8773 - val_loss: 0.3050
Epoch 6/10
[1m5222/5222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m729s[0m 140ms/step - accuracy: 0.8769 - loss: 0.3111 - val_accuracy: 0.8748 - val_loss: 

In [46]:
# Save the model
model.save("Distributed_CNN_BiLSTM_with_Hybrid_Attention_suicidal_ideation_model.h5")

