In [12]:
import pandas as pd
import numpy as np
import os

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, cohen_kappa_score

In [13]:
# Load and preprocess data
# dataset_dir = os.path.join('..', 'Dataset')
# data_path = os.path.join(dataset_dir, 'Suicide_Detection.csv')

data = pd.read_csv('Suicide_Detection.csv')

data.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  232074 non-null  int64 
 1   text        232074 non-null  object
 2   class       232074 non-null  object
dtypes: int64(1), object(2)
memory usage: 5.3+ MB


In [15]:
data['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
suicide,116037
non-suicide,116037


In [16]:
# Find duplicates based on the 'Name' column only
duplicates_by_name = data[data.duplicated(subset=['text'])]
print("Duplicates based on Name column:\n", duplicates_by_name)

Duplicates based on Name column:
 Empty DataFrame
Columns: [Unnamed: 0, text, class]
Index: []


In [17]:
data['class'].replace({'suicide':1 , 'non-suicide':0} , inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['class'].replace({'suicide':1 , 'non-suicide':0} , inplace = True)
  data['class'].replace({'suicide':1 , 'non-suicide':0} , inplace = True)


In [18]:
texts = data['text'].values
labels = data['class'].values

In [19]:
# Tokenization and padding
max_words = 20000  # Adjust based on vocabulary size
max_len = 360 # 200  # Adjust based on average post length

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
x_data = pad_sequences(sequences, maxlen=max_len)
y_data = np.array(labels)

In [20]:
# Split the data
x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.4, random_state=42, stratify=y_data)

In [21]:
# Model architecture
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(max_words, 128, input_length=max_len)(input_layer)

# CNN layer for feature extraction
cnn_layer = Conv1D(filters=64, kernel_size=5, activation='relu')(embedding_layer)
cnn_layer = GlobalMaxPooling1D()(cnn_layer)

# BiLSTM layer for sequence learning
bilstm_layer = Bidirectional(LSTM(64, return_sequences=False))(embedding_layer)

# Concatenate CNN and BiLSTM outputs
concat_layer = tf.keras.layers.concatenate([cnn_layer, bilstm_layer])

# Dense layers with dropout for regularization
dense_layer = Dense(64, activation='relu')(concat_layer)
dense_layer = Dropout(0.5)(dense_layer)
output_layer = Dense(1, activation='sigmoid')(dense_layer)

# Build and compile the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()



In [22]:
# Train the model
batch_size = 512
epochs = 50
history = model.fit(
    x_train, y_train,
    epochs = epochs,
    batch_size = batch_size,
    validation_data = (x_val, y_val),
    verbose = 1
)

Epoch 1/50
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 137ms/step - accuracy: 0.8300 - loss: 0.3802 - val_accuracy: 0.9374 - val_loss: 0.1701
Epoch 2/50
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 130ms/step - accuracy: 0.9461 - loss: 0.1561 - val_accuracy: 0.9442 - val_loss: 0.1519
Epoch 3/50
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 131ms/step - accuracy: 0.9584 - loss: 0.1228 - val_accuracy: 0.9431 - val_loss: 0.1551
Epoch 4/50
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 130ms/step - accuracy: 0.9661 - loss: 0.0995 - val_accuracy: 0.9428 - val_loss: 0.1565
Epoch 5/50
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 137ms/step - accuracy: 0.9741 - loss: 0.0783 - val_accuracy: 0.9375 - val_loss: 0.1737
Epoch 6/50
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 129ms/step - accuracy: 0.9821 - loss: 0.0559 - val_accuracy: 0.9402 - val_loss: 0.1939
Epoch 7/50

In [23]:
# Evaluation

# Evaluate model on validation set to get loss and accuracy
loss, accuracy = model.evaluate(x_val, y_val, verbose=0)
print(f"Model Accuracy: {accuracy:.4f}")

# Predict on the validation set
y_val_pred = (model.predict(x_val) > 0.5).astype("int32").flatten()

# Calculate accuracy using sklearn
accuracy_sklearn = accuracy_score(y_val, y_val_pred)
print(f"Accuracy : {accuracy_sklearn:.4f}")

# Calculate Cohen's Kappa score
kappa_score = cohen_kappa_score(y_val, y_val_pred)
print(f"Cohen's Kappa Score: {kappa_score:.4f}")

y_pred = (model.predict(x_val) > 0.5).astype("int32")
print(classification_report(y_val, y_pred))

Model Accuracy: 0.9315
[1m2901/2901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 11ms/step
Accuracy : 0.9315
Cohen's Kappa Score: 0.8631
[1m2901/2901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 11ms/step
              precision    recall  f1-score   support

           0       0.94      0.92      0.93     46415
           1       0.93      0.94      0.93     46415

    accuracy                           0.93     92830
   macro avg       0.93      0.93      0.93     92830
weighted avg       0.93      0.93      0.93     92830



In [24]:
# Save the model
model.save("cnn_bilstm_suicidal_ideation_model.h5")



In [25]:
# Evaluation

# Evaluate model on validation set to get loss and accuracy
loss, accuracy = model.evaluate(x_train, y_train, verbose=0)
print(f"Model Accuracy: {accuracy:.4f}")

# Predict on the validation set
y_val_pred = (model.predict(x_train) > 0.5).astype("int32").flatten()

# Calculate accuracy using sklearn
accuracy_sklearn = accuracy_score(y_train, y_val_pred)
print(f"Accuracy : {accuracy_sklearn:.4f}")

# Calculate Cohen's Kappa score
kappa_score = cohen_kappa_score(y_train, y_val_pred)
print(f"Cohen's Kappa Score: {kappa_score:.4f}")

y_pred = (model.predict(x_train) > 0.5).astype("int32")
print(classification_report(y_train, y_pred))

Model Accuracy: 0.9994
[1m4352/4352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 11ms/step
Accuracy : 0.9994
Cohen's Kappa Score: 0.9988
[1m4352/4352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 11ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     69622
           1       1.00      1.00      1.00     69622

    accuracy                           1.00    139244
   macro avg       1.00      1.00      1.00    139244
weighted avg       1.00      1.00      1.00    139244



In [26]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, cohen_kappa_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [28]:
# Load and preprocess data
# dataset_dir = os.path.join('..', 'Dataset')
# data_path = os.path.join(dataset_dir, 'Suicide_Detection.csv')

data = pd.read_csv('Suicide_Detection.csv')
data['class'].replace({'suicide':1 , 'non-suicide':0} , inplace = True)
data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['class'].replace({'suicide':1 , 'non-suicide':0} , inplace = True)
  data['class'].replace({'suicide':1 , 'non-suicide':0} , inplace = True)


Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,1
1,3,Am I weird I don't get affected by compliments...,0
2,4,Finally 2020 is almost over... So I can never ...,0
3,8,i need helpjust help me im crying so hard,1
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",1


In [29]:
texts = data['text'].values
labels = data['class'].values

In [35]:
# Parameters
max_words = 5000
max_len = 100
embedding_dim = 128
k_folds = 10

# Tokenizing and padding the text data
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, maxlen=max_len)
y = np.array(labels)

# CNN-BiLSTM Model Definition
def build_cnn_bilstm_model():
    # model = Sequential([
    #     Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
    #     Conv1D(64, kernel_size=5, activation='relu'),
    #     GlobalMaxPooling1D(),
    #     Bidirectional(LSTM(64)),
    #     Dense(1, activation='sigmoid')
    # ])
    # model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    # return model

    model = Sequential([
        Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
        Conv1D(64, kernel_size=5, activation='relu'),
        Bidirectional(LSTM(64)),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [36]:
# K-Fold Cross Validation
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
fold_no = 1

# Metrics storage
accuracy_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
kappa_scores = []

for train_index, val_index in kf.split(X):
    print(f"\nFold {fold_no}:")

    # Split data into train and validation sets for this fold
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Build the model
    model = build_cnn_bilstm_model()

    # Train the model
    model.fit(X_train, y_train, epochs=3, batch_size=32, verbose=1)

    # Predict on the validation set
    y_val_pred = (model.predict(X_val) > 0.5).astype("int32").flatten()

    # Calculate metrics for this fold
    acc = accuracy_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    kappa = cohen_kappa_score(y_val, y_val_pred)

    # Append metrics to lists
    accuracy_scores.append(acc)
    f1_scores.append(f1)
    precision_scores.append(precision)
    recall_scores.append(recall)
    kappa_scores.append(kappa)

    print(f"Accuracy: {acc:.4f}, F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, Kappa: {kappa:.4f}")
    fold_no += 1

# Calculate and display the mean and standard deviation of each metric across all folds
print("\n--- Cross-Validation Results ---")
print(f"Mean Accuracy: {np.mean(accuracy_scores):.4f} ± {np.std(accuracy_scores):.4f}")
print(f"Mean F1 Score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")
print(f"Mean Precision: {np.mean(precision_scores):.4f} ± {np.std(precision_scores):.4f}")
print(f"Mean Recall: {np.mean(recall_scores):.4f} ± {np.std(recall_scores):.4f}")
print(f"Mean Kappa: {np.mean(kappa_scores):.4f} ± {np.std(kappa_scores):.4f}")


Fold 1:
Epoch 1/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 11ms/step - accuracy: 0.9033 - loss: 0.2505
Epoch 2/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 11ms/step - accuracy: 0.9382 - loss: 0.1671
Epoch 3/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 11ms/step - accuracy: 0.9473 - loss: 0.1447
[1m726/726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step
Accuracy: 0.9345, F1 Score: 0.9340, Precision: 0.9345, Recall: 0.9334, Kappa: 0.8691

Fold 2:
Epoch 1/3




[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 12ms/step - accuracy: 0.9040 - loss: 0.2431
Epoch 2/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 12ms/step - accuracy: 0.9374 - loss: 0.1678
Epoch 3/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 12ms/step - accuracy: 0.9483 - loss: 0.1413
[1m726/726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step
Accuracy: 0.9336, F1 Score: 0.9337, Precision: 0.9338, Recall: 0.9336, Kappa: 0.8673

Fold 3:




Epoch 1/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 11ms/step - accuracy: 0.9025 - loss: 0.2495
Epoch 2/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 12ms/step - accuracy: 0.9390 - loss: 0.1645
Epoch 3/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 12ms/step - accuracy: 0.9484 - loss: 0.1406
[1m726/726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step
Accuracy: 0.9356, F1 Score: 0.9364, Precision: 0.9300, Recall: 0.9430, Kappa: 0.8712

Fold 4:
Epoch 1/3




[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 12ms/step - accuracy: 0.9012 - loss: 0.2478
Epoch 2/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 12ms/step - accuracy: 0.9365 - loss: 0.1695
Epoch 3/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 12ms/step - accuracy: 0.9447 - loss: 0.1491
[1m726/726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step
Accuracy: 0.9342, F1 Score: 0.9352, Precision: 0.9194, Recall: 0.9515, Kappa: 0.8684

Fold 5:
Epoch 1/3




[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 12ms/step - accuracy: 0.9029 - loss: 0.2492
Epoch 2/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 12ms/step - accuracy: 0.9360 - loss: 0.1697
Epoch 3/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 12ms/step - accuracy: 0.9469 - loss: 0.1444
[1m726/726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step
Accuracy: 0.9382, F1 Score: 0.9382, Precision: 0.9343, Recall: 0.9422, Kappa: 0.8763

Fold 6:
Epoch 1/3




[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 11ms/step - accuracy: 0.9011 - loss: 0.2506
Epoch 2/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 12ms/step - accuracy: 0.9362 - loss: 0.1712
Epoch 3/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 12ms/step - accuracy: 0.9459 - loss: 0.1451
[1m726/726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step
Accuracy: 0.9319, F1 Score: 0.9328, Precision: 0.9248, Recall: 0.9410, Kappa: 0.8638

Fold 7:




Epoch 1/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 11ms/step - accuracy: 0.9050 - loss: 0.2442
Epoch 2/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 12ms/step - accuracy: 0.9374 - loss: 0.1690
Epoch 3/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 12ms/step - accuracy: 0.9461 - loss: 0.1489
[1m726/726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step
Accuracy: 0.9358, F1 Score: 0.9355, Precision: 0.9476, Recall: 0.9237, Kappa: 0.8715

Fold 8:
Epoch 1/3




[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 12ms/step - accuracy: 0.9042 - loss: 0.2462
Epoch 2/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 12ms/step - accuracy: 0.9365 - loss: 0.1694
Epoch 3/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 12ms/step - accuracy: 0.9463 - loss: 0.1464
[1m726/726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step
Accuracy: 0.9333, F1 Score: 0.9342, Precision: 0.9237, Recall: 0.9450, Kappa: 0.8666

Fold 9:
Epoch 1/3




[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 12ms/step - accuracy: 0.9011 - loss: 0.2510
Epoch 2/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 11ms/step - accuracy: 0.9349 - loss: 0.1732
Epoch 3/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 11ms/step - accuracy: 0.9466 - loss: 0.1445
[1m726/726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step
Accuracy: 0.9336, F1 Score: 0.9320, Precision: 0.9437, Recall: 0.9205, Kappa: 0.8671

Fold 10:
Epoch 1/3




[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 12ms/step - accuracy: 0.9040 - loss: 0.2466
Epoch 2/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 11ms/step - accuracy: 0.9368 - loss: 0.1670
Epoch 3/3
[1m6528/6528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 12ms/step - accuracy: 0.9469 - loss: 0.1441
[1m726/726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step
Accuracy: 0.9369, F1 Score: 0.9377, Precision: 0.9286, Recall: 0.9471, Kappa: 0.8738

--- Cross-Validation Results ---
Mean Accuracy: 0.9348 ± 0.0018
Mean F1 Score: 0.9350 ± 0.0019
Mean Precision: 0.9320 ± 0.0083
Mean Recall: 0.9381 ± 0.0096
Mean Kappa: 0.8695 ± 0.0035
