In [13]:
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from huggingface_hub import login
from transformers import pipeline
import os
from dotenv import load_dotenv


In [14]:
# Load the jailbreak-classification dataset and rename its columns:
# "prompt" -> "text" (read by the tokenization step) and "type" -> "labels".
dataset = (
    load_dataset("jackhhao/jailbreak-classification")
    .rename_column("prompt", "text")
    .rename_column("type", "labels")
)

def label_mapping(example):
    """Replace the string class in ``example["labels"]`` with an integer id.

    ``"benign"`` becomes 0 and every other value becomes 1. The example dict
    is updated in place and the same object is returned.
    """
    example["labels"] = int(example["labels"] != "benign")
    return example

# Convert the string classes to integer ids for every example.
dataset = dataset.map(function=label_mapping)


  0%|          | 0/131 [01:26<?, ?it/s]


KeyboardInterrupt: 

In [11]:

# Human-readable class names stored in the model config so that downstream
# consumers (e.g. a text-classification pipeline) report names instead of
# generic LABEL_0 / LABEL_1. The ids follow the integer encoding produced by
# label_mapping: 0 = "benign", 1 = everything else.
id2label = {0: "benign", 1: "malicious"}
label2id = {name: idx for idx, name in id2label.items()}

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Uses the notebook-level ``tokenizer`` with truncation enabled and
    ``"max_length"`` padding, and returns whatever the tokenizer returns.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize the text column, passing examples to the tokenizer in batches.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Hyperparameters taken from: https://huggingface.co/jackhhao/jailbreak-classifier

# Directory the fine-tuned model and tokenizer are written to after training;
# the inference cell reloads them from this path via `output_dir`.
output_dir = "./results"

training_args = TrainingArguments(
    output_dir=output_dir,
    # NOTE(review): recent transformers releases name this argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    lr_scheduler_type="linear",
    # logging_dir="./logs",
    save_strategy="no",  # no intermediate checkpoints; final model is saved explicitly below
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)
trainer.train()

# With save_strategy="no" the Trainer never writes the model on its own, so
# persist the final weights and the tokenizer for the later from_pretrained calls.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# Kept as the last expression so the evaluation metrics are displayed.
trainer.evaluate()



KeyboardInterrupt: 

In [51]:

# Reload the fine-tuned checkpoint (tokenizer and weights) from `output_dir`.
tokenizer = BertTokenizer.from_pretrained(output_dir)
model = BertForSequenceClassification.from_pretrained(output_dir)

# Smoke test: classify one sample sentence with the reloaded model.
inputs = tokenizer("Sample text for classification", return_tensors="pt")
outputs = model(**inputs)

# Take the highest-scoring class per input and turn the tensor of class ids
# into a list of names: id 0 -> "benign", any other id -> "malicious".
predicted_ids = outputs.logits.argmax(dim=-1).tolist()
predictions = ["malicious" if class_id != 0 else "benign" for class_id in predicted_ids]
print(predictions)

['benign']


In [None]:
# Publish the model and tokenizer to the Hugging Face Hub, but only when an
# HF_TOKEN is present in the environment after load_dotenv() has run.
load_dotenv()
if "HF_TOKEN" in os.environ:
    print("pushing to huggingface hub")
    HF_TOKEN = os.environ["HF_TOKEN"]
    login(token=HF_TOKEN)
    hub_repo_id = "oranne55/qualifier-model3-finetune-pretrained-transformer"
    # Model first, then tokenizer, both into the same repository.
    for artifact in (model, tokenizer):
        artifact.push_to_hub(hub_repo_id)

In [8]:

# Build a text-classification pipeline from the model published on the Hub.
pipe = pipeline(
    task="text-classification",
    model="oranne55/qualifier-model3-finetune-pretrained-transformer",
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [9]:
# Sample inputs for a quick sanity check of the published pipeline.
# NOTE(review): these are review/sentiment-style sentences; none of them looks
# like a jailbreak prompt, so this check presumably only exercises the
# "benign" class -- consider adding a known jailbreak example.
test_texts = [
    "I love the new design of your product!",
    "This service was terrible and I am very disappointed.",
    "The team did a great job and exceeded my expectations.",
    "I would not recommend this to anyone."
]

# Run the whole list through the pipeline in a single call; the result is
# stored in `results` and displayed by the next cell.
results = pipe(test_texts)

In [10]:
# Show the pipeline output for the sample texts.
results

[{'label': 'benign', 'score': 0.9999874830245972},
 {'label': 'benign', 'score': 0.9999877214431763},
 {'label': 'benign', 'score': 0.9999895095825195},
 {'label': 'benign', 'score': 0.999993085861206}]