In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import matplotlib.pyplot as plt
import numpy as np
import os

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
# Download stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load dataset
data = pd.read_csv('Suicide_Detection.csv')

In [5]:
# Preprocessing function
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'#\w+', '', text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'[^a-z\s]', '', text)
        # text = ' '.join(word for word in word_tokenize(text) if word not in stop_words)
        text = ' '.join(word for word in text.split() if word not in stop_words)
        return text
    return ''

In [6]:
# Apply preprocessing
data['cleaned_text'] = data['Tweet'].apply(preprocess_text)

In [7]:
# Encode labels
label_encoder = LabelEncoder()
data['Suicide'] = label_encoder.fit_transform(data['Suicide'])

In [9]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text'], data['Suicide'], test_size=0.2, random_state=42
)

In [10]:
# Vectorization with TF-IDF
vectorizer = TfidfVectorizer(max_features=10000)
X_train_vectorized = vectorizer.fit_transform(X_train).toarray()
X_test_vectorized = vectorizer.transform(X_test).toarray()

In [12]:
# Convert to NumPy arrays (sparse to dense conversion is efficient in batches)
X_train_np = X_train_vectorized.astype(np.float32)
X_test_np = X_test_vectorized.astype(np.float32)
y_train_np = np.array(y_train.values, dtype=np.int64)
y_test_np = np.array(y_test.values, dtype=np.int64)

In [None]:
# X_train_np = np.array(X_train_vectorized, dtype=np.float32)
# y_train_np = np.array(y_train.values, dtype=np.int64)
# X_test_np = np.array(X_test_vectorized, dtype=np.float32)
# y_test_np = np.array(y_test.values, dtype=np.int64)

In [13]:
X_train_tensor = torch.tensor(X_train_np).to(device)
y_train_tensor = torch.tensor(y_train_np).to(device)
X_test_tensor = torch.tensor(X_test_np).to(device)
y_test_tensor = torch.tensor(y_test_np).to(device)

In [14]:
# Use DataLoader for efficient batch processing
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [22]:
# Check Dataloader Length
print(f"Number of Batches in Train DataLoader: {len(train_loader)}")
print(f"Batch Size: {train_loader.batch_size}")

Number of Batches in Train DataLoader: 2901
Batch Size: 64


In [15]:
# Define LSTM Model
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, dropout_rate=0.5):
        super(LSTMClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.lstm = nn.LSTM(input_size=128, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x, (hn, _) = self.lstm(x.unsqueeze(1))
        x = self.dropout(hn[-1])
        x = self.fc2(x)
        return x

In [16]:
# Model initialization
input_size = X_train_tensor.shape[1]
print(input_size)
hidden_size = 64
num_classes = 2
model = LSTMClassifier(input_size, hidden_size, num_classes).to(device)

10000


In [17]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scaler = torch.cuda.amp.GradScaler()

In [18]:
# Training loop with DataLoader and Gradient Scaling
num_epochs = 25
training_loss = []
training_acc = []

In [19]:
os.makedirs('checkpoints', exist_ok=True)

In [23]:
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    correct = 0
    total = 0
    
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        epoch_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()
    
    acc = correct / total
    print(f"{epoch_loss}\t{len(train_loader)}\t{correct}\t{total}")
    training_loss.append(epoch_loss / len(train_loader))
    training_acc.append(acc)
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {acc*100:.2f}%')

6.029979195287922	2901	185497	185659
Epoch [1/25], Loss: 6.0300, Accuracy: 99.91%
5.236249476134525	2901	185502	185659
Epoch [2/25], Loss: 5.2362, Accuracy: 99.92%
4.728542295679176	2901	185507	185659
Epoch [3/25], Loss: 4.7285, Accuracy: 99.92%
4.854543195279803	2901	185507	185659
Epoch [4/25], Loss: 4.8545, Accuracy: 99.92%
4.867768667638302	2901	185510	185659
Epoch [5/25], Loss: 4.8678, Accuracy: 99.92%
4.982813546591899	2901	185509	185659
Epoch [6/25], Loss: 4.9828, Accuracy: 99.92%
4.81091943648407	2901	185510	185659
Epoch [7/25], Loss: 4.8109, Accuracy: 99.92%
5.99258941039443	2901	185493	185659
Epoch [8/25], Loss: 5.9926, Accuracy: 99.91%
5.197805741265142	2901	185509	185659
Epoch [9/25], Loss: 5.1978, Accuracy: 99.92%
4.71133917431205	2901	185506	185659
Epoch [10/25], Loss: 4.7113, Accuracy: 99.92%
4.713211715919897	2901	185509	185659
Epoch [11/25], Loss: 4.7132, Accuracy: 99.92%
4.893956515054075	2901	185502	185659
Epoch [12/25], Loss: 4.8940, Accuracy: 99.92%
5.43750646928231

In [25]:
with torch.no_grad():
    sample_output = model(X_train_tensor[:5])
    print("Sample Output (Raw Logits):", sample_output)
    print("Target Labels:", y_train_tensor[:5])

Sample Output (Raw Logits): tensor([[-25.3217,  25.0970],
        [ 17.2560, -17.6153],
        [-29.0649,  29.2397],
        [ 32.7077, -33.5828],
        [ 29.6142, -30.1499]], device='cuda:0')
Target Labels: tensor([1, 0, 1, 0, 0], device='cuda:0')
