In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
import torch
from transformers import BertForSequenceClassification, AdamW
from tqdm import tqdm
import pickle

In [2]:
# Load the labelled posts from the CSV file into a DataFrame
file_path = 'suicide_detection.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# suicide = 1, non-suicide = 0
label_to_id = {'suicide': 1, 'non-suicide': 0}

# Re-running this cell on an already-encoded column would map every 0/1 to NaN,
# so only encode while the column still holds something other than the ids.
if not df['class'].isin(list(label_to_id.values())).all():
    encoded = df['class'].map(label_to_id)
    # Values missing from the mapping come back as NaN; fail loudly instead of
    # letting NaN labels flow into the split and the label tensors.
    if encoded.isna().any():
        unexpected = df.loc[encoded.isna(), 'class'].unique().tolist()
        raise ValueError(f"Unexpected values in 'class' column: {unexpected}")
    df['class'] = encoded.astype('int64')


In [4]:
# Split the dataset into training and validation sets.
# random_state pins the shuffle: without it every kernel restart produces a different
# split, so a model reloaded from disk could be scored on rows it was trained on.
# stratify keeps the suicide / non-suicide ratio the same in both sets.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'],
    df['class'],
    test_size=.2,
    random_state=42,
    stratify=df['class'],
)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [5]:
# Tokenize both splits with one shared set of options: truncate to at most 128 tokens,
# pad, and return PyTorch tensors
tokenizer_options = dict(truncation=True, padding=True, max_length=128, return_tensors="pt")

train_encodings = tokenizer(list(train_texts), **tokenizer_options)
val_encodings = tokenizer(list(val_texts), **tokenizer_options)

In [6]:
# Turn the label columns into tensors, then bundle token ids, attention masks and
# labels of each split into a TensorDataset
train_label_tensor = torch.tensor(train_labels.values)
val_label_tensor = torch.tensor(val_labels.values)

train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_label_tensor,
)
val_dataset = TensorDataset(
    val_encodings['input_ids'],
    val_encodings['attention_mask'],
    val_label_tensor,
)

In [7]:
# Creating Data Loaders: both use the same batch size and worker count;
# only the training loader shuffles its samples
loader_options = dict(batch_size=16, num_workers=2)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

In [8]:
# Pick the fastest available backend: a CUDA GPU first, then Apple's Metal Performance
# Shaders (MPS), and finally the CPU as the universal fallback.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Pretrained BERT encoder with a freshly initialized 2-class classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

# Use PyTorch's own AdamW instead of the deprecated transformers.AdamW;
# weight_decay adds decoupled weight-decay regularization.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [69]:
epochs = 2


def _as_model_inputs(tensors):
    """Name the positional tensors of a batch and move each one to `device`."""
    names = ['input_ids', 'attention_mask', 'labels']
    return {name: tensor.to(device) for name, tensor in zip(names, tensors)}


for epoch in range(epochs):
    # ---- Training pass: accumulate the batch losses and update the weights ----
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader):
        batch = _as_model_inputs(batch)
        loss = model(**batch).loss
        train_loss += loss.item()

        # Backpropagate, apply the update, then clear gradients for the next batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss/len(train_loader)}")

    # ---- Validation pass: same loss, but no gradients and no weight updates ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader):
            batch = _as_model_inputs(batch)
            loss = model(**batch).loss
            val_loss += loss.item()
    print(f"Epoch {epoch + 1}/{epochs}, Val Loss: {val_loss/len(val_loader)}")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11604/11604 [56:59<00:00,  3.39it/s]


Epoch 1/2, Train Loss: 0.04714729780526478


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2901/2901 [03:42<00:00, 13.03it/s]


Epoch 1/2, Val Loss: 0.06261578886086308


  0%|                                                                                                                                       | 0/11604 [00:01<?, ?it/s]


KeyboardInterrupt: 

In [70]:
# Persist only the model's weights (its state_dict) to disk
model_weights = model.state_dict()
torch.save(model_weights, 'model_v2.pth')


In [74]:
from sklearn.metrics import f1_score
import numpy as np

In [1]:
# Generate predictions
predictions, true_labels = [], []

model.eval()  # disable dropout so predictions are deterministic
with torch.no_grad():
    for input_ids, attention_mask, labels in tqdm(val_loader):
        # Only the model inputs have to be on the device. The labels are needed
        # solely for the metrics below, so they are not copied to the device and back.
        outputs = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
        )
        # argmax over the class dimension gives the predicted label per sample
        predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
        true_labels.extend(labels.tolist())

# Calculate the F1 score
f1 = f1_score(true_labels, predictions)
print(f"F1 Score: {f1}")



NameError: name 'model' is not defined

In [27]:
from sklearn.metrics import f1_score, precision_score, recall_score


In [28]:

# Score the collected predictions against the true labels with each metric in turn
precision, recall, f1 = (
    score_fn(true_labels, predictions)
    for score_fn in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Precision: 0.9795757363253857
Recall: 0.970894874022589
F1 Score: 0.9752159874334584


In [68]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Assuming you have already loaded your model and tokenizer
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def predict_single_text(model, tokenizer, text):
    """Classify a single text and return the predicted class with its probabilities.

    Args:
        model: Sequence-classification model. It must expose ``device``, accept
            ``input_ids`` and ``attention_mask`` keyword arguments, and return an
            object with a ``logits`` attribute.
        tokenizer: Callable that turns ``text`` into a mapping holding
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        text: The text to classify.

    Returns:
        A ``(predicted_class, probabilities)`` tuple: the index of the most
        probable class (a NumPy integer) and a 1-D NumPy array with the softmax
        probability of every class.
    """
    # Tokenize the input text (truncated to at most 128 tokens)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Move tensors to the same device as model
    input_ids = inputs['input_ids'].to(model.device)
    attention_mask = inputs['attention_mask'].to(model.device)

    # Inference must run in eval mode: if the model was left in training mode,
    # dropout would stay active and the prediction would change from call to call.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
    finally:
        # Leave the model in whatever mode the caller had it in
        if was_training:
            model.train()

    # Apply softmax to logits to get probabilities
    probabilities = torch.softmax(logits, dim=-1)

    # Index of the highest probability for the first (only) item in the batch
    predicted_class = torch.argmax(probabilities, dim=-1).cpu().numpy()[0]

    # Convert the probabilities of that item to NumPy for easier interpretation
    probabilities = probabilities.cpu().numpy()[0]

    return predicted_class, probabilities

# Example usage: classify one sentence and show the predicted class and probabilities
text = "I'm so tired I wanna die. Huh I'll be fine"
prediction = predict_single_text(model, tokenizer, text)
predicted_class, probabilities = prediction

print(f"Predicted class: {predicted_class}")
print(f"Class probabilities: {probabilities}")


Predicted class: 0
Class probabilities: [0.51949364 0.4805063 ]
