In [2]:
# from https://github.com/jackhhao/llm-warden/blob/main/src/train.py

In [3]:
from datasets import load_dataset, ClassLabel
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer
)
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Label vocabulary; the position of each name in the list is its integer ID.
labels = ClassLabel(names=["benign", "jailbreak"])

# Load the dataset and rename its columns to `text` / `label`.
dataset = (
    load_dataset("jackhhao/jailbreak-classification")
    .rename_column("prompt", "text")
    .rename_column("type", "label")
)
# Tokenizer for the same `bert-base-uncased` checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Name <-> ID lookups that are later handed to the model's `from_pretrained` call.
label2id = {"benign": 0, "jailbreak": 1}
id2label = {idx: name for name, idx in label2id.items()}

def tokenize_function(examples):
    """Tokenize a batch of examples and convert their string labels to IDs.

    Args:
        examples: A batch with a ``"text"`` entry (passed to the tokenizer)
            and a ``"label"`` entry (passed to ``labels.str2int``).

    Returns:
        The tokenizer output with its ``"label"`` entry set to the result of
        ``labels.str2int`` on the batch's labels.
    """
    # Pad to the tokenizer's maximum length and truncate anything longer.
    encoded = tokenizer(examples["text"], truncation=True, padding="max_length")
    encoded["label"] = labels.str2int(examples["label"])
    return encoded

# Run the tokenizer over every split, one batch of examples per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Shuffle both splits with a fixed seed. Despite the `small_` prefix, no
# subsetting is applied here: these are the complete train and test splits.
small_train_dataset, small_eval_dataset = (
    tokenized_datasets[split_name].shuffle(seed=42)
    for split_name in ("train", "test")
)


In [5]:
# Display the tokenized splits to check their columns and row counts.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1044
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 262
    })
})

In [6]:

# Accuracy metric used to score each evaluation pass.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, label_ids)``.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions against
        the reference label IDs.
    """
    logits, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(logits, axis=-1)
    return metric.compute(references=references, predictions=predicted_ids)

# Load the pretrained checkpoint with a classification head sized to the
# label mapping (the head weights are newly initialized, hence the warning
# printed below the cell).
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Evaluate and checkpoint once per epoch; the two strategies are kept aligned
# because `load_best_model_at_end` is enabled.
training_args = TrainingArguments(
    output_dir="../training/",
    num_train_epochs=2,
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version before changing.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Persist the fine-tuned model. The tokenizer is never handed to the Trainer,
# so `save_model` alone would not write it; save it to the same directory so
# "../model/" can be reloaded as a self-contained model + tokenizer.
trainer.save_model("../model/")
tokenizer.save_pretrained("../model/");  # trailing `;` hides the returned file list

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                 
 50%|█████     | 131/262 [03:10<02:39,  1.22s/it]

{'eval_loss': 0.05411437526345253, 'eval_accuracy': 0.9847328244274809, 'eval_runtime': 13.4475, 'eval_samples_per_second': 19.483, 'eval_steps_per_second': 2.454, 'epoch': 1.0}


                                                 
100%|██████████| 262/262 [06:29<00:00,  1.19s/it]

{'eval_loss': 0.03156168386340141, 'eval_accuracy': 0.9923664122137404, 'eval_runtime': 11.757, 'eval_samples_per_second': 22.285, 'eval_steps_per_second': 2.807, 'epoch': 2.0}


100%|██████████| 262/262 [06:31<00:00,  1.50s/it]


{'train_runtime': 391.6981, 'train_samples_per_second': 5.331, 'train_steps_per_second': 0.669, 'train_loss': 0.10274617726566228, 'epoch': 2.0}
