In [None]:
from datasets import load_dataset, concatenate_datasets
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
import numpy as np

In [None]:
# Step 1: Load datasets from Hugging Face
print("Loading datasets...")
datasets_to_use = [
    "JasperLS/prompt-injections",
    "rubend18/ChatGPT-Jailbreak-Prompts",
    "deepset/prompt-injections",
    "ahsanayub/malicious-prompts",
]

# Collect one split per source: "train" when the loaded dataset exposes it,
# otherwise whichever split is listed first.
datasets = []
for ds_name in datasets_to_use:
    ds = load_dataset(ds_name)
    datasets.append(ds["train"] if "train" in ds else ds[list(ds)[0]])

# Merge the collected splits into a single dataset.
# NOTE(review): concatenate_datasets presumably requires every input to share
# the same columns/features, and label normalization only runs after this
# merge — TODO confirm all four sources expose compatible "text"/"label" columns.
dataset = concatenate_datasets(datasets)

Loading datasets...


NameError: name 'load_dataset' is not defined

In [None]:
# Instantiate the pretrained DistilBERT fast tokenizer (uncased base checkpoint)
tokenizer = DistilBertTokenizerFast.from_pretrained(
    "distilbert-base-uncased",
)

# Step 2: Preprocess — ensure binary labels (0 = safe, 1 = malicious)
def normalize_labels(example):
    """Coerce one example's label to a binary int (0 = safe, 1 = malicious).

    Args:
        example: Mapping for a single row. Reads "label" when it holds a
            value; otherwise reads the optional "tag" and "text" fields.

    Returns:
        The same mapping, mutated so that example["label"] is 0 or 1.

    A "label" that is present but None is treated as missing. Previously
    `None != 0` evaluated to True, so unlabeled rows were silently marked
    malicious and the heuristic fallback was never consulted for them.
    """
    label = example.get("label")
    if label is not None:
        # Any non-zero label collapses to 1.
        example["label"] = int(label != 0)
    else:
        # Heuristic fallback for rows without a usable label: look for
        # "malicious" in the tag or "jailbreak" in the prompt text.
        # A missing or None text is treated as an empty string rather than crashing.
        tag = str(example.get("tag", "")).lower()
        text = str(example.get("text") or "").lower()
        example["label"] = int("malicious" in tag or "jailbreak" in text)
    return example

def tokenize(example, tokenizer):
    """Run `tokenizer` on example["text"] with fixed-length encoding options.

    Args:
        example: Mapping with a "text" entry to encode.
        tokenizer: Callable invoked as tokenizer(text, **options).

    Returns:
        Whatever `tokenizer` returns for the text when called with
        padding="max_length", truncation=True and max_length=256.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(example["text"], **encode_options)

In [None]:
# Step 2 (apply): normalize every row's label to 0/1
dataset = dataset.map(normalize_labels)

# Step 3: Tokenization
# tokenize() declares the tokenizer as its second parameter, but map() only
# passes the batch positionally; forward the tokenizer through fn_kwargs so the
# call does not fail with a missing-argument TypeError.
dataset = dataset.map(tokenize, batched=True, fn_kwargs={"tokenizer": tokenizer})

# Step 4: Split train/test and set format
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# Step 5: Load model for binary classification
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,  # two output classes
)

In [None]:
# Step 6: Metrics
def compute_metrics(pred):
    """Summarize an evaluation pass as accuracy, F1, precision and recall.

    Args:
        pred: Object exposing `label_ids` (reference labels) and
            `predictions` (per-class scores; argmax is taken over axis 1).

    Returns:
        Dict with keys "accuracy", "f1", "precision" and "recall", where the
        last three are computed with average='binary'.
    """
    y_true = pred.label_ids
    # Predicted class = index of the highest score in each row.
    y_pred = np.argmax(pred.predictions, axis=1)
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": prf[2],
        "precision": prf[0],
        "recall": prf[1],
    }


# Step 7: Training Arguments
# The keyword selecting the evaluation schedule was renamed from
# `evaluation_strategy` to `eval_strategy` in newer transformers releases, and
# the old name is no longer accepted there. Use whichever name the installed
# TrainingArguments dataclass declares so this cell runs on both.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./distilbert-malicious-prompt",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    # Evaluate once per epoch, matching the checkpoint schedule above.
    **{eval_strategy_kwarg: "epoch"},
)


# Step 8: Trainer
# Wire the model, hyperparameters, data splits and metric callback together.
# NOTE(review): newer transformers releases appear to prefer `processing_class=`
# over `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)



In [None]:

# Step 9: Train!
# Announce the start, then run the Trainer's training loop. The call is left as
# the cell's last expression, so its return value is shown as the cell output.
print("Starting training...")
trainer.train()

In [None]:
# Step 10: Evaluate
print("Evaluating...")
predictions = trainer.predict(dataset["test"])
# Score against the labels returned with the predictions (the same
# `label_ids` field compute_metrics reads) instead of re-reading the
# torch-formatted dataset column, so both inputs to the report come from the
# same prediction pass.
print(classification_report(predictions.label_ids, np.argmax(predictions.predictions, axis=1)))
