In [8]:
import pandas as pd
import numpy as np

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
#import plotly as px

# Train test split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Word2Vec Embedding
import gensim
from gensim.models import Word2Vec
from sklearn.utils import resample


from gensim.models import FastText
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.models import KeyedVectors

from keras.utils import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ELMo Embedding
import tensorflow as tf
import tensorflow_hub as hub
import h5py

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.base import BaseEstimator
from sklearn.metrics import make_scorer, accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import confusion_matrix
from scipy.stats import uniform  
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.svm import SVC
import numpy as np
import joblib

from sklearn.preprocessing import LabelEncoder

In [4]:
# Load the data
data = pd.read_csv('cleaned_suicide_data.csv')
data.dropna(inplace=True)
data.head()

Unnamed: 0.1,Unnamed: 0,lemmatized_text,class,pos_tags
0,0,ex wife threaten suicide recently leave wife g...,suicide,"[('ex', 'NOUN'), ('wife', 'NOUN'), ('threateni..."
1,1,weird get affect comp li ment come someone kno...,non-suicide,"[('weird', 'ADJ'), ('get', 'AUX'), ('affected'..."
2,2,finally almost never hear bad year ever swear ...,non-suicide,"[('finally', 'ADV'), ('almost', 'ADV'), ('neve..."
3,3,need help help cry hard,suicide,"[('need', 'VERB'), ('help', 'NOUN'), ('help', ..."
4,4,lose hello name adam struggle year afraid past...,suicide,"[('lost', 'VERB'), ('hello', 'INTJ'), ('name',..."


In [5]:
data.isnull().sum()

Unnamed: 0         0
lemmatized_text    0
class              0
pos_tags           0
dtype: int64

In [10]:


# Additional feature extraction: Sentiment analysis
sia = SentimentIntensityAnalyzer()

# Check for NaN values in 'lemmatized_text'
data['sentiment_score'] = data['lemmatized_text'].apply(lambda x: sia.polarity_scores(str(x))['compound'] if pd.notnull(x) else np.nan)

# Drop rows with NaN values in 'sentiment_score'
data = data.dropna(subset=['sentiment_score'])

# Combine features
X_text = data['lemmatized_text']
X_sentiment = np.array(data['sentiment_score']).reshape(-1, 1)

# Train-test split (60% train, 20% fine-tuning, 20% comparison)
X_train_text, X_compare_text, X_train_sentiment, X_compare_sentiment, y_train, y_compare = train_test_split(X_text, X_sentiment, data['class'], test_size=0.4, random_state=42)

# Further split the remaining data for comparison into fine-tuning and comparison
X_fine_tune_text, X_compare_text, X_fine_tune_sentiment, X_compare_sentiment, y_fine_tune, y_compare = train_test_split(X_compare_text, X_compare_sentiment, y_compare, test_size=0.5, random_state=42)

# Feature extraction: TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=1000)
X_train_text_tfidf = vectorizer.fit_transform(X_train_text)
X_compare_text_tfidf = vectorizer.transform(X_compare_text)
X_fine_tune_text_tfidf = vectorizer.transform(X_fine_tune_text)

# Combine TF-IDF features with additional features
X_train = np.hstack((X_train_text_tfidf.toarray(), X_train_sentiment))
X_compare = np.hstack((X_compare_text_tfidf.toarray(), X_compare_sentiment))
X_fine_tune = np.hstack((X_fine_tune_text_tfidf.toarray(), X_fine_tune_sentiment))

# Model selection and training for Random Forest
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)

# Save Random Forest model
joblib.dump(random_forest_model, 'random_forest_model.joblib')

# Model evaluation on comparison set
y_compare_pred_rf = random_forest_model.predict(X_compare)
print("Random Forest Comparison Set Classification Report:")
print(classification_report(y_compare, y_compare_pred_rf))

# Model evaluation on fine-tuning set
y_fine_tune_pred_rf = random_forest_model.predict(X_fine_tune)
print("Random Forest Fine-Tuning Set Classification Report:")
print(classification_report(y_fine_tune, y_fine_tune_pred_rf))

# Fine-tuning Random Forest
# Define hyperparameters to tune
param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Grid search for best parameters
grid_search_rf = GridSearchCV(estimator=random_forest_model, param_grid=param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(X_fine_tune, y_fine_tune)

# Get the best parameters
best_params_rf = grid_search_rf.best_params_
print("Best Parameters for Random Forest:", best_params_rf)

# Train Random Forest with the best parameters
random_forest_model_best = RandomForestClassifier(**best_params_rf, random_state=42)
random_forest_model_best.fit(X_train, y_train)

# Model evaluation on comparison set with fine-tuned Random Forest
y_compare_pred_rf_best = random_forest_model_best.predict(X_compare)
print("\nRandom Forest (Fine-Tuned) Comparison Set Classification Report:")
print(classification_report(y_compare, y_compare_pred_rf_best))

# Model evaluation on fine-tuning set with fine-tuned Random Forest
y_fine_tune_pred_rf_best = random_forest_model_best.predict(X_fine_tune)
print("Random Forest (Fine-Tuned) Fine-Tuning Set Classification Report:")
print(classification_report(y_fine_tune, y_fine_tune_pred_rf_best))

# Combine TF-IDF features with additional features for Naive Bayes
X_train_nb = np.hstack((X_train_text_tfidf.toarray(), X_train_sentiment + 1))  # Add 1 to ensure non-negative values
X_compare_nb = np.hstack((X_compare_text_tfidf.toarray(), X_compare_sentiment + 1))
X_fine_tune_nb = np.hstack((X_fine_tune_text_tfidf.toarray(), X_fine_tune_sentiment + 1))

# Model selection and training for Naive Bayes
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train_nb, y_train)

# Save Naive Bayes model
joblib.dump(naive_bayes_model, 'naive_bayes_model.joblib')

# Model evaluation on comparison set with Naive Bayes
y_compare_pred_nb = naive_bayes_model.predict(X_compare_nb)
print("\nNaive Bayes Comparison Set Classification Report:")
print(classification_report(y_compare, y_compare_pred_nb))

# Model evaluation on fine-tuning set with Naive Bayes
y_fine_tune_pred_nb = naive_bayes_model.predict(X_fine_tune_nb)
print("Naive Bayes Fine-Tuning Set Classification Report:")
print(classification_report(y_fine_tune, y_fine_tune_pred_nb))

# Fine-tuning Naive Bayes
# Define hyperparameters to tune
param_grid_nb = {
    'alpha': [0.1, 0.5, 1.0, 2.0]  # Add more values for alpha if needed
}

# Grid search for best parameters
grid_search_nb = GridSearchCV(estimator=MultinomialNB(), param_grid=param_grid_nb, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search_nb.fit(X_fine_tune_nb, y_fine_tune)

# Get the best hyperparameters
best_params_nb = grid_search_nb.best_params_
print("Best Hyperparameters for Naive Bayes:", best_params_nb)

# Retrain the model using the best hyperparameters
best_nb_model = MultinomialNB(alpha=best_params_nb['alpha'])
best_nb_model.fit(X_train_nb, y_train)

# Save the best Naive Bayes model
joblib.dump(best_nb_model, 'best_naive_bayes_model.joblib')

# Model evaluation on comparison set using the best model
y_pred_best_nb = best_nb_model.predict(X_compare_nb)
print("\nBest Naive Bayes Comparison Set Classification Report:")
print(classification_report(y_compare, y_pred_best_nb))

# Model evaluation on fine-tuning set using the best model
y_pred_best_nb_fine_tune = best_nb_model.predict(X_fine_tune_nb)
print("Best Naive Bayes Fine-Tuning Set Classification Report:")
print(classification_report(y_fine_tune, y_pred_best_nb_fine_tune))

# Model selection and training for SVM
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Save SVM model
joblib.dump(svm_model, 'svm_model.joblib')

# Model evaluation on comparison set with SVM
y_compare_pred_svm = svm_model.predict(X_compare)
print("\nSVM Comparison Set Classification Report:")
print(classification_report(y_compare, y_compare_pred_svm))

# Model evaluation on fine-tuning set with SVM
y_fine_tune_pred_svm = svm_model.predict(X_fine_tune)
print("SVM Fine-Tuning Set Classification Report:")
print(classification_report(y_fine_tune, y_fine_tune_pred_svm))

# Fine-tuning SVM
# Define hyperparameters to tune
param_grid_svm = {
    'C': [0.1, 1, 10],  # Reduce the number of values
    'kernel': ['linear', 'rbf'],  # Limit the kernel choices
    'gamma': ['scale', 'auto']
}

# Randomized search for best parameters
random_search_svm = RandomizedSearchCV(estimator=svm_model, param_distributions=param_grid_svm, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1)
random_search_svm.fit(X_fine_tune, y_fine_tune)

# Get the best parameters
best_params_svm = random_search_svm.best_params_
print("Best Parameters for SVM:", best_params_svm)

# Train SVM with the best parameters
svm_model_best = SVC(**best_params_svm)
svm_model_best.fit(X_train, y_train)

# Model evaluation on comparison set with fine-tuned SVM
y_compare_pred_svm_best = svm_model_best.predict(X_compare)
print("\nSVM (Fine-Tuned) Comparison Set Classification Report:")
print(classification_report(y_compare, y_compare_pred_svm_best))

# Model evaluation on fine-tuning set with fine-tuned SVM
y_fine_tune_pred_svm_best = svm_model_best.predict(X_fine_tune)
print("SVM (Fine-Tuned) Fine-Tuning Set Classification Report:")
print(classification_report(y_fine_tune, y_fine_tune_pred_svm_best))


Random Forest Comparison Set Classification Report:
              precision    recall  f1-score   support

 non-suicide       0.91      0.89      0.90     23308
     suicide       0.89      0.91      0.90     23088

    accuracy                           0.90     46396
   macro avg       0.90      0.90      0.90     46396
weighted avg       0.90      0.90      0.90     46396

Random Forest Fine-Tuning Set Classification Report:
              precision    recall  f1-score   support

 non-suicide       0.91      0.90      0.90     23195
     suicide       0.90      0.91      0.91     23201

    accuracy                           0.90     46396
   macro avg       0.90      0.90      0.90     46396
weighted avg       0.90      0.90      0.90     46396

Best Parameters for Random Forest: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}

Random Forest (Fine-Tuned) Comparison Set Classification Report:
              precision    recall  f1-score   support


In [14]:
sentences = [
    "I can't go on like this anymore. The pain is unbearable.",
    "It feels like there's no way out of this darkness. I'm so tired of fighting.",
    "I don't see any reason to keep living. Everything just hurts too much.",
    "I feel like I'm a burden to everyone around me. Maybe it's better if I just disappear.",
    "I've been thinking a lot about ending it all. Maybe that's the only way to find peace.",
    "Nobody would even miss me if I were gone. I don't belong here.",
    "The world would be better off without me. I'm just a waste of space.",
    "I can't shake this feeling of hopelessness. Maybe death is the only way to escape.",
    "I've thought about how I would do it. Maybe tonight's the night.",
    "I'm so alone. I don't think anyone understands how much pain I'm in.",
    "I want to live"
]

# Preprocess and vectorize the sentences using the TF-IDF vectorizer for Naive Bayes
sentences_tfidf = vectorizer.transform(sentences)
sentiments = [sia.polarity_scores(sentence)['compound'] for sentence in sentences]

# Combine TF-IDF features with sentiment scores for Random Forest
X_sentences = np.hstack((sentences_tfidf.toarray(), np.array(sentiments).reshape(-1, 1) + 1))



# Use Random Forest model for prediction
predictions_rf = random_forest_model_best.predict(X_sentences)
print("\nRandom Forest Predictions:")
print(predictions_rf)

# Use NB for prediction
predictions_nb = best_nb_model.predict(X_sentences)
print("\nNaive Bayes Predictions:")
print(predictions_nb)

# Use SVM model for prediction
predictions_svm = svm_model_best.predict(X_sentences)
print("\nSVM Predictions:")
print(predictions_svm)






Random Forest Predictions:
['non-suicide' 'non-suicide' 'non-suicide' 'non-suicide' 'non-suicide'
 'non-suicide' 'non-suicide' 'non-suicide' 'non-suicide' 'non-suicide'
 'non-suicide']

Naive Bayes Predictions:
['suicide' 'suicide' 'suicide' 'suicide' 'suicide' 'suicide' 'suicide'
 'suicide' 'suicide' 'suicide' 'non-suicide']

SVM Predictions:
['suicide' 'non-suicide' 'suicide' 'suicide' 'non-suicide' 'non-suicide'
 'non-suicide' 'suicide' 'suicide' 'suicide' 'non-suicide']


In [15]:
# Save SVM-best model
joblib.dump(svm_model_best, 'svm_model_best.joblib')

# Save Random Forest Best model
joblib.dump(random_forest_model_best, 'random_forest_model_best.joblib')

['random_forest_model_best.joblib']