In [27]:
import pandas as pd
import numpy as np
import os

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [28]:
# Load and preprocess data
dataset_dir = os.path.join('..', 'Dataset')
data_path = os.path.join(dataset_dir, 'Suicide_Detection.csv')

data = pd.read_csv(data_path)

data.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  232074 non-null  int64 
 1   text        232074 non-null  object
 2   class       232074 non-null  object
dtypes: int64(1), object(2)
memory usage: 5.3+ MB


In [30]:
data['class'].value_counts()

class
suicide        116037
non-suicide    116037
Name: count, dtype: int64

In [31]:
# Find duplicates based on the 'Name' column only
duplicates_by_text = data[data.duplicated(subset=['text'])]
print("Duplicates based on Name column:\n", duplicates_by_text)

Duplicates based on Name column:
 Empty DataFrame
Columns: [Unnamed: 0, text, class]
Index: []


In [32]:
data['class'].replace({'suicide':1 , 'non-suicide':0} , inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['class'].replace({'suicide':1 , 'non-suicide':0} , inplace = True)
  data['class'].replace({'suicide':1 , 'non-suicide':0} , inplace = True)


In [33]:
texts = data['text'].values
labels = data['class'].values

In [34]:
# Tokenization and padding
max_words = 20000  # Adjust based on vocabulary size
max_len = 200  # Adjust based on average post length

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
x_data = pad_sequences(sequences, maxlen=max_len)
y_data = np.array(labels)

In [35]:
x_data

array([[   0,    0,    0, ...,    9,  246,  424],
       [   0,    0,    0, ..., 1573,   27,    9],
       [   0,    0,    0, ...,   74,   18, 1027],
       ...,
       [   0,    0,    0, ..., 2794,  155, 5297],
       [   9,   38,   19, ...,   98,    4,   82],
       [   0,    0,    0, ...,   17,  555, 1518]], dtype=int32)

In [36]:
y_data

array([1, 0, 0, ..., 0, 1, 0])

In [37]:
x_data[0], len(x_data[0])

(array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,   541,   740,  2470,     1,
          237,     6,   740,    14,    99,    45,    43,    86,  1384,
           25,     8,  1040,     3,  1429,     2,     8,    18,    81,
           10,     1,    15,   418,     2,  1813,     2,    70,   110,
            2,    49,    48,     7,     5,   186,   224,   220,    43,
         1158,  2470,   134,     1,    15, 15959,   570,   229,   186,
          224,   289,    49,    38,     7,     9,     3,    43,   744,
        11214,    45,    43,   367,     2,   352,   206,   250,   110,
      

In [38]:
# Split the data
x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.2, random_state=42, stratify=y_data)


In [39]:
# Model architecture
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(max_words, 128, input_length=max_len)(input_layer)

# CNN layer for feature extraction
cnn_layer = Conv1D(filters=64, kernel_size=5, activation='relu')(embedding_layer)
cnn_layer = GlobalMaxPooling1D()(cnn_layer)

# BiLSTM layer for sequence learning
bilstm_layer = Bidirectional(LSTM(64, return_sequences=False))(embedding_layer)

# Concatenate CNN and BiLSTM outputs
concat_layer = tf.keras.layers.concatenate([cnn_layer, bilstm_layer])

# Dense layers with dropout for regularization
dense_layer = Dense(64, activation='relu')(concat_layer)
dense_layer = Dropout(0.5)(dense_layer)
output_layer = Dense(1, activation='sigmoid')(dense_layer)

# Build and compile the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()



In [40]:
# Train the model
batch_size = 32
epochs = 10
history = model.fit(
    x_train, y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(x_val, y_val),
    verbose=1
)

Epoch 1/10
[1m5802/5802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m973s[0m 167ms/step - accuracy: 0.9057 - loss: 0.2381 - val_accuracy: 0.9401 - val_loss: 0.1611
Epoch 2/10
[1m5802/5802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m918s[0m 158ms/step - accuracy: 0.9546 - loss: 0.1285 - val_accuracy: 0.9440 - val_loss: 0.1523
Epoch 3/10
[1m5802/5802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m925s[0m 159ms/step - accuracy: 0.9700 - loss: 0.0849 - val_accuracy: 0.9435 - val_loss: 0.1748
Epoch 4/10
[1m5802/5802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m923s[0m 159ms/step - accuracy: 0.9817 - loss: 0.0513 - val_accuracy: 0.9392 - val_loss: 0.2276
Epoch 5/10
[1m5802/5802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m923s[0m 159ms/step - accuracy: 0.9895 - loss: 0.0290 - val_accuracy: 0.9371 - val_loss: 0.2598
Epoch 6/10
[1m5802/5802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m931s[0m 160ms/step - accuracy: 0.9928 - loss: 0.0201 - val_accuracy: 0.9370 - val_loss:

In [41]:
# Evaluation
y_pred = (model.predict(x_val) > 0.5).astype("int32")
print(classification_report(y_val, y_pred))

[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 39ms/step
              precision    recall  f1-score   support

           0       0.96      0.89      0.92     23287
           1       0.90      0.96      0.93     23128

    accuracy                           0.93     46415
   macro avg       0.93      0.93      0.93     46415
weighted avg       0.93      0.93      0.93     46415



In [61]:
from sklearn.metrics import accuracy_score, cohen_kappa_score

# Evaluate model on validation set to get loss and accuracy
loss, accuracy = model.evaluate(x_val, y_val, verbose=0)
print(f"Model Accuracy: {accuracy:.4f}")

# Predict on the validation set
y_val_pred = (model.predict(x_val) > 0.5).astype("int32").flatten()

# Calculate accuracy using sklearn
accuracy_sklearn = accuracy_score(y_val, y_val_pred)
print(f"Accuracy : {accuracy_sklearn:.4f}")

# Calculate Cohen's Kappa score
kappa_score = cohen_kappa_score(y_val, y_val_pred)
print(f"Cohen's Kappa Score: {kappa_score:.4f}")

Model Accuracy: 0.9269
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 39ms/step
Accuracy (sklearn): 0.9269
Cohen's Kappa Score: 0.8538


In [42]:
# Save the model
model.save("cnn_bilstm_suicidal_ideation_model.h5")



In [59]:
# # Convert x_val to DataFrame and y_val to Series
# x_val_df = pd.DataFrame(x_val, columns=[f'feature_{i}' for i in range(x_val.shape[1])])
# y_val_df = pd.Series(y_val, name='class')

# # Concatenate x_val and y_val along the columns
# combined_df = pd.concat([x_val_df,  y_val_df], axis=1)

# # Display the resulting DataFrame
# print("Combined DataFrame:\n", combined_df)


# score = model.evaluate(x_val, verbose=0)
# print('Train loss:', score[0])
# print('Train accuracy:', score[1])