In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
import torch
from transformers import BertForSequenceClassification, AdamW
from tqdm import tqdm
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

In [18]:
file_path = 'suicide_detection.csv'
df = pd.read_csv(file_path)
# suicide = 1, non-suicide = 0
df['class'] = df['class'].map({'suicide': 1, 'non-suicide': 0})
# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['class'], test_size=.2)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the texts
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128, return_tensors="pt")
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=128, return_tensors="pt")

# Convert to torch tensors
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_labels.values))
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(val_labels.values))

# Creating Data Loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=2)

In [19]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Instantiate the model architecture
model.load_state_dict(torch.load('model_v2.pth'))  # Load the fine-tuned weights
model.eval()  # Set the model to evaluation mode

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [30]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch


# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Assuming you have already loaded your model and tokenizer
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def predict_single_text(model, tokenizer, text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    
    # Move tensors to the same device as model
    input_ids = inputs['input_ids'].to(model.device)
    attention_mask = inputs['attention_mask'].to(model.device)
    
    # Get model predictions
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    
    # Apply softmax to logits to get probabilities
    probabilities = torch.softmax(logits, dim=-1)
    
    # Get the predicted class (0 or 1) based on the highest probability
    predicted_class = torch.argmax(probabilities, dim=-1).cpu().numpy()[0]  # Extract the predicted class
    
    # Optionally, convert probabilities to numpy for easier interpretation
    probabilities = probabilities.cpu().numpy()[0]
    
    return predicted_class, probabilities

# Example usage
text = "It is now 1:22AM and i am tired. i am depressed?"
predicted_class, probabilities = predict_single_text(model, tokenizer, text)
print(f"Predicted class: {predicted_class}")
print(f"Class probabilities: {probabilities}")

Predicted class: 0
Class probabilities: [0.867737   0.13226299]


In [22]:
# Generate predictions
predictions, true_labels = [], []

model.eval()
with torch.no_grad():
    for batch in tqdm(val_loader):
        input_ids, attention_mask, labels = batch  # Unpack the batch directly
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)  # Ensure labels are also moved to the correct device
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Calculate the F1 score
f1 = f1_score(true_labels, predictions)
print(f"F1 Score: {f1}")


100%|███████████████████████████████████████| 2901/2901 [17:26<00:00,  2.77it/s]

F1 Score: 0.9918493415125906





In [23]:
# Calculate precision, recall, and F1 score
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Precision: 0.9909141559165131
Recall: 0.9927862939585211
F1 Score: 0.9918493415125906
