In [None]:
import pickle
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from google.colab import drive
from google.colab import files
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Mount Google Drive
# `drive` and `files` are imported from google.colab in the imports cell.
drive.mount('/content/drive')

# Load dataset (Upload manually if needed)
# Primary source is the CSV on Drive; if it is missing, fall back to an
# interactive upload and read the first uploaded entry.
file_path = "/content/drive/My Drive/Suicide_Detection.csv"
try:
    df = pd.read_csv(file_path, encoding="utf-8")
    print("✅ Dataset loaded successfully from Google Drive.")
except FileNotFoundError:
    print("⚠️ File not found. Please upload manually.")
    uploaded = files.upload()
    if not uploaded:
        # Without this guard an empty upload surfaces as a bare StopIteration
        # from next(iter(...)), which says nothing about the actual problem.
        raise FileNotFoundError("No file was uploaded; cannot load the dataset.") from None
    # The first key of `uploaded` is used as the path to read; same encoding
    # as the Drive branch so both sources are parsed identically.
    df = pd.read_csv(next(iter(uploaded)), encoding="utf-8")
    print("✅ Dataset loaded successfully from upload.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Dataset loaded successfully from Google Drive.


In [None]:
# Keep only relevant columns
# Positional columns 1 and 2 are used as the post text and its class label
# (presumably column 0 is an index column — TODO confirm against the CSV).
# NOTE: this cell expects the freshly loaded `df`; re-run the loading cell
# before re-running this one, because `df` is rebound below.
# .copy() detaches the slice so later column assignments act on an
# independent frame instead of a view of the loaded data.
df = df.iloc[:, [1, 2]].copy()
df.columns = ['text', 'label']
df = df.dropna()

# Convert labels to binary format
# Series.map() yields NaN for any value that is not a key of the mapping, and
# that happens after the dropna above. Drop such rows explicitly so NaN labels
# never reach stratified splitting or model fitting.
labels = df['label'].map({'suicide': 1, 'non-suicide': 0})
known = labels.notna()
if not known.all():
    print(f"⚠️ Dropping {int((~known).sum())} rows with unrecognized labels.")
df = df.loc[known].assign(label=labels[known].astype('int64'))
print("✅ Labels converted to binary format.")

# Initialize NLP tools
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Patterns used by clean_text, compiled once up front.
# The URL alternatives are anchored at a word boundary so that ordinary words
# that merely contain "www" or "http" (e.g. "awww") are not cut off mid-word.
_URL_RE = re.compile(r'\bhttp\S+|\bwww\.\S+')
_MENTION_HASHTAG_RE = re.compile(r'@\w+|#\w+')
_NON_LETTER_RE = re.compile(r'[^a-z\s]')


def clean_text(text):
    """Normalize one raw post into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase; strip URLs; strip @mentions and #hashtags;
    remove every character that is not a-z or whitespace; split on whitespace;
    drop tokens found in ``stop_words``; lemmatize the rest with ``lemmatizer``.

    Args:
        text: The raw text. Any non-string value is treated as missing.

    Returns:
        The cleaned text, or "" when ``text`` is not a string or nothing
        survives the filtering.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = _URL_RE.sub('', text)  # Remove URLs
    text = _MENTION_HASHTAG_RE.sub('', text)  # Remove mentions & hashtags
    text = _NON_LETTER_RE.sub('', text)  # Remove special characters & punctuation
    # Stop-word filtering runs before lemmatization, on the surface form.
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return " ".join(tokens)

# Apply text preprocessing
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['text']: the chained in-place form acts on a temporary object, emits a
# FutureWarning, and stops working in pandas 3.0.
df['text'] = df['text'].fillna("")
df['clean_text'] = df['text'].apply(clean_text)
print("✅ Text cleaning completed.")

y = df['label']

# Train-test split
# Split the cleaned text BEFORE fitting the vectorizer so that the vocabulary
# and IDF weights are learned from training documents only; fitting on the
# full corpus would leak test-set statistics into the features.
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], y, test_size=0.2, stratify=y, random_state=42
)
print("✅ Dataset split into training and testing sets.")

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=2000)
X_train = vectorizer.fit_transform(text_train)  # fit on training text only
X_test = vectorizer.transform(text_test)        # reuse the fitted vocabulary/IDF
# Full-corpus matrix kept under the original name `X` for any code that still
# refers to it; it is produced by the train-fitted vectorizer.
X = vectorizer.transform(df['clean_text'])
print(f"✅ TF-IDF feature extraction completed. Shape: {X.shape}")

✅ Labels converted to binary format.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['text'].fillna("", inplace=True)


✅ Text cleaning completed.
✅ TF-IDF feature extraction completed. Shape: (232074, 2000)
✅ Dataset split into training and testing sets.


In [None]:
# Train SVM model (Linear kernel for speed)
# SVC comes from `from sklearn.svm import SVC` in the imports cell; without
# that import this cell fails with NameError on a fresh kernel.
# class_weight='balanced' asks scikit-learn to weight classes inversely to
# their frequency in y_train.
model = SVC(kernel='linear', class_weight='balanced')
print("Training SVM model...")
model.fit(X_train, y_train)
print("SVM model trained successfully.")

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the model
test_accuracy = accuracy_score(y_test, y_pred)
print(f"\n Test Accuracy: {test_accuracy:.4f}")
print("\n Classification Report:")
print(classification_report(y_test, y_pred))
print("\n Confusion Matrix:")
# Rows are true labels and columns are predicted labels; pinning labels=[0, 1]
# keeps the 2x2 layout fixed even if a class is absent from the predictions.
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))


Training SVM model...
SVM model trained successfully.

 Test Accuracy: 0.9279

 Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.94      0.93     23208
           1       0.94      0.92      0.93     23207

    accuracy                           0.93     46415
   macro avg       0.93      0.93      0.93     46415
weighted avg       0.93      0.93      0.93     46415


 Confusion Matrix:
[[21740  1468]
 [ 1878 21329]]
