In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
# NOTE(review): torchtext.legacy only exists in older torchtext releases and none of
# these three names are used in the visible cells — confirm it is still needed.
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
from torchtext.vocab import build_vocab_from_iterator

In [None]:
# Load the raw CSV and derive a numeric target from the textual 'Suicide' column
# (1 for 'suicide', 0 for 'non-suicide'); any other value ends up as NaN.
data = pd.read_csv("/kaggle/input/suicide-tweet/Suicide_Detection.csv").assign(
    label=lambda frame: frame['Suicide'].map({'suicide': 1, 'non-suicide': 0})
)

In [None]:
# Keep only the rows in which both the tweet text and the numeric label are present
# (labels that did not match the mapping are NaN at this point and are removed here).
data = data[data[['Tweet', 'label']].notna().all(axis=1)]

In [None]:
# Normalise the tweet text: lowercase it, then strip every character that is
# neither an ASCII letter nor whitespace (digits, punctuation, emoji, ...).
data = data.assign(
    Tweet=data['Tweet'].str.lower().str.replace(r"[^a-zA-Z\s]", "", regex=True)
)

In [None]:
# Train-validation split
# Hold out 20% of the rows for validation; random_state pins the shuffle so the
# same split is produced on every run.
# NOTE(review): the split is not stratified on the label — confirm the class balance
# of both halves is acceptable, or pass stratify=data['label'].
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['Tweet'], data['label'], test_size=0.2, random_state=42
)

In [None]:
# Shared tokenizer, used by yield_tokens (vocabulary building) and text_transform.
# "basic_english" is one of torchtext's built-in tokenizers — presumably it lowercases
# and splits on whitespace/punctuation; confirm against the torchtext documentation.
tokenizer = get_tokenizer("basic_english")

In [None]:
def yield_tokens(data_iter):
    """Lazily tokenize every text in ``data_iter``.

    Args:
        data_iter: iterable of raw text strings.

    Yields:
        The result of calling the module-level ``tokenizer`` on each text,
        in the same order as the input.
    """
    yield from map(tokenizer, data_iter)

In [None]:
# Build the vocabulary from the training split only, so no validation token leaks in.
# Two special tokens are registered up front:
#   "<unk>" - fallback index returned for any token that is not in the vocabulary;
#   "<pad>" - dedicated padding index. collate_batch pads with vocab["<pad>"]; without
#             this entry that lookup silently resolves to the "<unk>" index, so padding
#             and out-of-vocabulary tokens would share one embedding row.
vocab = build_vocab_from_iterator(yield_tokens(train_texts), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

In [None]:
def text_transform(x):
    """Tokenize a raw string and map every token to its vocabulary index."""
    return [vocab[token] for token in tokenizer(x)]


def label_transform(x):
    """Coerce a label value to a plain Python ``int``."""
    return int(x)

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that turns (text, label) rows into index tensors.

    Args:
        texts: positionally indexable (``.iloc``) collection of raw strings.
        labels: positionally indexable (``.iloc``) collection of labels,
            aligned with ``texts``.
        vocab: callable mapping a list of tokens to a list of integer indices.
        tokenizer: callable mapping a raw string to a list of tokens.
    """

    def __init__(self, texts, labels, vocab, tokenizer):
        self.texts, self.labels = texts, labels
        self.vocab, self.tokenizer = vocab, tokenizer

    def __len__(self):
        # The number of samples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(token_index_tensor, label_tensor)`` for row ``idx``, both int64."""
        # .iloc is positional, so a non-contiguous pandas index does not matter here.
        raw_text = self.texts.iloc[idx]
        token_ids = self.vocab(self.tokenizer(raw_text))
        target = self.labels.iloc[idx]
        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

In [None]:
def collate_batch(batch):
    """Collate ``(text, label)`` samples into a padded batch.

    Args:
        batch: sequence of ``(text, label)`` pairs. ``text`` is either a raw string,
            which is tokenized and indexed with ``text_transform``, or a tensor of
            token indices that was already prepared (e.g. by ``TextDataset``).
            ``label`` is anything ``int()`` accepts.

    Returns:
        ``(texts, labels)`` where ``texts`` is an int64 tensor of shape
        ``(batch, longest_sequence)`` and ``labels`` an int64 tensor of shape ``(batch,)``.
    """
    texts, labels = zip(*batch)
    labels = torch.tensor([int(label) for label in labels], dtype=torch.long)  # Ensure labels are integers

    sequences = []
    for text in texts:
        if torch.is_tensor(text):
            # Already indexed upstream; running the tokenizer on a tensor would fail.
            sequences.append(text.to(torch.long))
        else:
            # Explicit dtype: torch.tensor([]) would otherwise be float32, and an
            # empty text could make the whole padded batch unusable for nn.Embedding.
            sequences.append(torch.tensor(text_transform(text), dtype=torch.long))

    # Right-pad every sequence to the longest one in the batch. The fill value is
    # whatever index the vocabulary returns for "<pad>".
    padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=vocab["<pad>"])
    return padded, labels

In [None]:
# train_dataset / val_dataset were never defined in the notebook, so this cell failed
# on a fresh kernel. collate_batch tokenizes and indexes raw strings itself, so each
# dataset is simply the list of (raw_text, label) pairs from the split.
train_dataset = list(zip(train_texts, train_labels))
val_dataset = list(zip(val_texts, val_labels))

# Only the training loader is shuffled; the validation order stays fixed.
train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True, collate_fn=collate_batch)
val_dataloader = DataLoader(val_dataset, batch_size=512, shuffle=False, collate_fn=collate_batch)

In [None]:
# Step 7: Model Definition
class TextClassificationModel(nn.Module):
    """Bag-of-embeddings classifier: embed tokens, mean-pool over time, linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: width of each token embedding.
        num_classes: number of output logits.
        padding_idx: optional index of the padding token. When given, padding
            positions are excluded from the mean, so the output for a sequence no
            longer depends on how much padding its batch added. When ``None``
            (default) every position is averaged.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, padding_idx=None):
        super(TextClassificationModel, self).__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.fc = nn.Linear(embed_dim, num_classes)
        self.pool = nn.AdaptiveAvgPool1d(1)

    def forward(self, text):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            text: integer tensor of token indices, shape ``(batch, seq_len)``.
        """
        embedded = self.embedding(text)  # (batch, seq_len, embed_dim)
        if self.padding_idx is None:
            # Plain mean over the time axis; the pool expects (batch, channels, length).
            pooled = self.pool(embedded.permute(0, 2, 1)).squeeze(-1)
        else:
            # Masked mean: sum the real tokens and divide by how many there are.
            mask = (text != self.padding_idx).unsqueeze(-1).to(embedded.dtype)
            # clamp avoids 0/0 for a sequence that consists of padding only.
            lengths = mask.sum(dim=1).clamp(min=1.0)
            pooled = (embedded * mask).sum(dim=1) / lengths
        return self.fc(pooled)

In [None]:
# Model hyperparameters.
# One embedding row per vocabulary entry (special tokens included).
vocab_size = len(vocab)
# Width of each token embedding vector.
embed_dim = 64
# Binary target: the label column holds 0 or 1.
num_classes = 2

In [None]:
# Instantiate the classifier from the hyperparameters set in the previous cell.
model = TextClassificationModel(vocab_size, embed_dim, num_classes)

In [None]:
# Disabled alternative, kept for reference: an XLM-R encoder with a RoBERTa
# classification head in place of the small embedding model.
# num_classes = 2
# input_dim = 768
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER

# classifier_head = RobertaClassificationHead(num_classes=num_classes, input_dim=input_dim)
# model = XLMR_BASE_ENCODER.get_model(head=classifier_head)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model's parameters to the selected device. As the last expression of the
# cell, this also displays the module summary.
model.to(device)

In [None]:
# NOTE(review): `F` is not referenced in any visible cell — confirm it is needed.
import torchtext.functional as F
from torch.optim import AdamW

# NOTE(review): 1e-5 matches typical fine-tuning rates for pretrained encoders (see
# the disabled XLM-R code); for this model trained from scratch it may be too small —
# confirm it is intentional.
learning_rate = 1e-5
# AdamW over every model parameter.
optim = AdamW(model.parameters(), lr=learning_rate)
# Cross-entropy over the raw logits returned by the model (no softmax applied there).
criteria = nn.CrossEntropyLoss()

In [None]:
def train(dataloader, model, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: iterable of ``(texts, labels)`` tensor batches; must support ``len``.
        model: module mapping a text batch to per-class logits.
        criterion: loss callable taking ``(logits, labels)``.
        optimizer: optimizer bound to the model's parameters.
        device: device every batch is moved to before the forward pass.

    Returns:
        ``(avg_loss, accuracy)``: the mean of the per-batch loss values and the
        fraction of samples whose arg-max prediction equals the label.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for batch_texts, batch_labels in dataloader:
        batch_texts = batch_texts.to(device)
        batch_labels = batch_labels.to(device)

        # Standard step: reset gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(batch_texts)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        # Statistics use the logits computed before this step's weight update.
        running_loss += batch_loss.item()
        n_correct += (logits.argmax(1) == batch_labels).sum().item()
        n_seen += batch_labels.size(0)

    return running_loss / len(dataloader), n_correct / n_seen

In [None]:
def evaluate(dataloader, model, criterion, device):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Args:
        dataloader: iterable of ``(texts, labels)`` tensor batches; must support ``len``.
        model: module mapping a text batch to per-class logits.
        criterion: loss callable taking ``(logits, labels)``.
        device: device every batch is moved to before the forward pass.

    Returns:
        ``(avg_loss, accuracy)``: the mean of the per-batch loss values and the
        fraction of samples whose arg-max prediction equals the label.
    """
    model.eval()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch_texts, batch_labels in dataloader:
            batch_texts = batch_texts.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_texts)
            running_loss += criterion(logits, batch_labels).item()
            n_correct += (logits.argmax(1) == batch_labels).sum().item()
            n_seen += batch_labels.size(0)

    return running_loss / len(dataloader), n_correct / n_seen


In [None]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

def compute_metrics(dataloader, model, device):
    """Collect predictions over ``dataloader`` and summarise them.

    Args:
        dataloader: iterable of ``(texts, labels)`` tensor batches.
        model: module mapping a text batch to per-class logits.
        device: device every batch is moved to before the forward pass.

    Returns:
        ``(precision, recall, f1, cm)``: the first three come from
        ``precision_recall_fscore_support`` with ``average="binary"``, and ``cm`` is
        the result of ``confusion_matrix``.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch_texts, batch_labels in dataloader:
            logits = model(batch_texts.to(device))
            # Flatten each batch into plain Python ints for the sklearn helpers.
            predicted.extend(logits.argmax(1).cpu().tolist())
            expected.extend(batch_labels.to(device).cpu().tolist())

    # The fourth return value (support) is not used.
    precision, recall, f1, _ = precision_recall_fscore_support(expected, predicted, average="binary")
    return precision, recall, f1, confusion_matrix(expected, predicted)


In [None]:
# Train for a fixed number of epochs, validating after each one.
num_epochs = 25
# StepLR multiplies the learning rate by gamma=0.1 after every 5 scheduler steps.
# NOTE(review): over 25 epochs that is four decays before the last epoch; starting
# from the 1e-5 set in the optimizer cell, the final epochs run at 1e-9 — confirm
# this schedule is intended.
scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=5, gamma=0.1)
for epoch in range(num_epochs):
    # One optimisation pass over the training data, then a gradient-free validation pass.
    train_loss, train_accuracy = train(train_dataloader, model, criteria, optim, device)
    val_loss, val_accuracy = evaluate(val_dataloader, model, criteria, device)
    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"  Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"  Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    # Advance the schedule once per epoch, after the optimizer updates of that epoch.
    scheduler.step()
# Compute F1-score and Confusion Matrix after training
precision, recall, f1, cm = compute_metrics(val_dataloader, model, device)
print("\nMetrics on Validation Set:")
print(f"  Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
print(f"  Confusion Matrix:\n{cm}")
