In [10]:
from transformers import pipeline
from datasets import load_dataset
import evaluate


In [11]:
pipe = pipeline("text-classification", model="jackhhao/jailbreak-classifier")
dataset = load_dataset("jackhhao/jailbreak-classification")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'type'],
        num_rows: 1044
    })
    test: Dataset({
        features: ['prompt', 'type'],
        num_rows: 262
    })
})

In [17]:
# Load all metrics from the evaluate library
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")
recall_metric = evaluate.load("recall")
precision_metric = evaluate.load("precision")

In [19]:
# Define a function to evaluate predictions on the test set
def evaluate_model(model_pipeline, dataset):
    predictions = []
    references = []

    for sample in dataset:
        # Get model prediction with truncation
        prediction = model_pipeline(sample["prompt"], truncation=True)[0]
        
        # Map text labels to numerical labels
        if prediction["label"].lower() == "benign":
            predicted_label = 0
        elif prediction["label"].lower() == "jailbreak":
            predicted_label = 1
        else:
            raise ValueError(f"Unexpected label format: {prediction['label']}")

        predictions.append(predicted_label)
        references.append(1 if sample["type"].lower() == "jailbreak" else 0)  # Adjust label mapping accordingly

    # Compute each metric
    f1_score = f1_metric.compute(predictions=predictions, references=references, average="binary")
    accuracy = accuracy_metric.compute(predictions=predictions, references=references)
    recall = recall_metric.compute(predictions=predictions, references=references, average="binary")
    precision = precision_metric.compute(predictions=predictions, references=references, average="binary")

    return {
        "F1 Score": f1_score["f1"],
        "Accuracy": accuracy["accuracy"],
        "Recall": recall["recall"],
        "Precision": precision["precision"]
    }

# Evaluate on the test set
metrics = evaluate_model(pipe, test_dataset)
print("Evaluation Results:")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")

Evaluation Results:
F1 Score: 0.9747292418772563
Accuracy: 0.9732824427480916
Recall: 0.9712230215827338
Precision: 0.9782608695652174
