In [14]:
# Gerekli kütüphaneleri içe aktar
from transformers import BertForSequenceClassification, BertTokenizer
import torch
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import joblib
import os

# Current working directory
current_directory = os.getcwd()


# Kaydedilen model ve tokenizer'ı yükle
model = BertForSequenceClassification.from_pretrained(current_directory)
tokenizer = BertTokenizer.from_pretrained(current_directory)
# Load the MultiLabelBinarizer
multilabel = joblib.load(current_directory + "/multilabel_binarizer.pkl")


In [17]:
text = '''Subject: Get 80% OFF! Comprehensive Health Check-Up Special

Dear [Recipient],

Your health is our priority! That’s why we’re offering you an exclusive, one-day-only deal:
 Complete blood test and full health check-up for just $19.99!
FREE cholesterol, sugar, and blood pressure screenings.
Online consultation with our certified health specialists.

It’s quick and easy to book! Click here now to claim this incredible offer.

 Hurry, offer expires at midnight!

Remember, regular health checks are essential for a healthy life. Don’t miss this chance to take control of your well-being!

Best regards,
HealthCare Plus
Visit our website | Contact us.'''

encoding = tokenizer(text, return_tensors='pt')
# Model ile tahmin yap
with torch.no_grad():
    outputs = model(**encoding)

sigmoid = torch.nn.Sigmoid()
probs = sigmoid(outputs.logits[0].cpu())
preds = np.zeros(probs.shape)
preds[np.where(probs>=0.3)] = 1

multilabel.classes_

multilabel.inverse_transform(preds.reshape(1,-1))

[('health', 'scam', 'spam')]

In [16]:
import pandas as pd
import numpy as np
import torch

# Load your CSV file
input_csv_path = "test.csv"  # Replace with your input file path
output_csv_path = "test_out.csv"  # Replace with your desired output file path

# Load the CSV into a DataFrame
data = pd.read_csv(input_csv_path)

# Assuming your model, tokenizer, and multilabel binarizer are already loaded
# Replace 'text_column' with the column in your CSV that contains the text to predict
text_column = "Body"

# Prepare an empty list to store predictions
predictions = []

# Define sigmoid activation for probabilities
sigmoid = torch.nn.Sigmoid()

# Process each row of the CSV
for index, row in data.iterrows():
    # Tokenize the text with truncation to ensure no input exceeds the model's max length
    text = row[text_column]
    encoding = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)  # Truncating input to 512 tokens
    
    with torch.no_grad():
        outputs = model(**encoding)

    # Apply sigmoid activation directly on the logits (tensor)
    probs = sigmoid(outputs.logits[0].cpu())  # Apply sigmoid on tensor directly

    # Convert to numpy for thresholding
    probs_np = probs.detach().numpy()

    # Threshold for multilabel classification
    preds = np.zeros(probs_np.shape)
    preds[np.where(probs_np >= 0.3)] = 1  # Threshold set to 0.3

    # Decode predicted labels using multilabel.inverse_transform
    decoded_labels = multilabel.inverse_transform(preds.reshape(1, -1))
    predictions.append(decoded_labels[0])  # Append the decoded labels as a list

# Add predictions to the DataFrame
data["predictions"] = [", " + ", ".join(pred) for pred in predictions]  # Join labels with ';'

# Save the new DataFrame with predictions to a CSV file
data.to_csv(output_csv_path, index=False)

print(f"Predictions saved to {output_csv_path}")


Predictions saved to test_out.csv


In [7]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

# Load your CSV file containing true and predicted labels
input_csv_path = "test_out.csv"  # Replace with the actual path of your CSV file
data = pd.read_csv(input_csv_path)

# Prepend a comma to every value in the 'Labels' column
data['Labels'] = data['Labels'].apply(lambda x: f", {x}")

# Save the updated DataFrame back to the same file
data.to_csv(input_csv_path, index=False)

# Load your CSV file containing true and predicted labels
input_csv_path = "test_out.csv"  # Replace with the actual path of your CSV file
data = pd.read_csv(input_csv_path)

# Assuming 'Labels1' column contains the true labels and 'predictions1' column contains the predicted labels
true_labels = data['Labels']  # Column with true labels
predicted_labels = data['predictions']  # Column with predicted labels

# Split the true and predicted labels by commas and strip spaces, convert them to sets
y_true = true_labels.apply(lambda x: set(label.strip() for label in x.split(',')))  # Remove spaces and use sets to ignore order
y_pred = predicted_labels.apply(lambda x: set(label.strip() for label in x.split(',')))  # Remove spaces and use sets to ignore order

# Use MultiLabelBinarizer to transform labels for classification report
mlb = MultiLabelBinarizer()
y_true_bin = mlb.fit_transform(y_true)
y_pred_bin = mlb.transform(y_pred)

# Generate classification report
report = classification_report(y_true_bin, y_pred_bin, target_names=mlb.classes_)
print("Classification Report:")
print(report)

# Calculate accuracy for each label separately
accuracies = {}
for idx, label in enumerate(mlb.classes_):
    true_for_label = y_true_bin[:, idx]
    pred_for_label = y_pred_bin[:, idx]
    accuracy_for_label = (true_for_label == pred_for_label).mean()  # Compute accuracy for this label
    accuracies[label] = accuracy_for_label

# Print the accuracy for each label
print("\nAccuracy for each label:")
for label, accuracy in accuracies.items():
    print(f"Accuracy for '{label}': {accuracy * 100:.2f}%")


Classification Report:
                    precision    recall  f1-score   support

                         1.00      1.00      1.00       299
     advertisement       0.55      0.48      0.51        44
               ham       0.95      0.94      0.95       121
            health       0.76      0.89      0.82        38
illegal activities       0.23      0.50      0.32        14
          indecent       0.50      0.68      0.58        28
        investment       0.14      0.43      0.21         7
        irrelevant       0.54      0.48      0.51        44
            normal       0.94      0.94      0.94       120
            pshing       0.65      0.59      0.62        63
              scam       0.76      0.89      0.82       106
    sexual content       0.32      0.67      0.43        12
              spam       0.86      0.96      0.91       160

         micro avg       0.81      0.88      0.84      1056
         macro avg       0.63      0.73      0.66      1056
      weighted 