In [1]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from datasets import load_dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split

# NOTE(review): trust_remote_code=True allows `datasets` to run the loading script
# shipped with the Hub repository -- only enable it for sources you trust.
dataset = load_dataset(
    "ealvaradob/phishing-dataset",
    "combined_reduced",
    trust_remote_code=True,
)

# Only the "train" split is used here; a reproducible 80/20 train/test partition
# is carved out of it with scikit-learn (fixed random_state).
df = dataset['train'].to_pandas()
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Back to `datasets.Dataset` objects; preserve_index=False keeps the shuffled
# pandas index from being carried over into the dataset.
train = Dataset.from_pandas(train_df, preserve_index=False)
test = Dataset.from_pandas(test_df, preserve_index=False)

Generating train split: 77677 examples [00:05, 14741.07 examples/s]


All records are labeled as 1 (Phishing) or 0 (Benign).

Load the tokenizer and the model

In [6]:
# Name the checkpoint once so the tokenizer and the model always load the same one.
MODEL_CHECKPOINT = "roberta-base"

tokenizer = RobertaTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Two output labels (binary classification). The classifier head is newly
# initialised, as the load-time message below reports, so it must be fine-tuned.
model = RobertaForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=2,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenize the datasets

In [7]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch of examples.

    Sequences are truncated to at most 512 tokens. Padding is deliberately NOT
    applied here: with ``batched=True`` it would pad every map-batch to its
    longest sequence and store that padding in the dataset, defeating the
    per-training-batch dynamic padding done by ``DataCollatorWithPadding``.

    Args:
        examples: Batch passed in by ``Dataset.map(..., batched=True)``; must
            provide a ``"text"`` entry.

    Returns:
        The tokenizer's encoding for the batch (unpadded, variable length).
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Tokenize both splits the same way; batched=True hands tokenize_function
# a batch of examples per call instead of a single row.
train, test = [
    split.map(tokenize_function, batched=True)
    for split in (train, test)
]

Map: 100%|██████████| 62141/62141 [06:45<00:00, 153.34 examples/s]
Map: 100%|██████████| 15536/15536 [01:43<00:00, 150.02 examples/s]


Define the data collator

In [10]:
# Collator handed to the Trainer below: pads the examples of each batch with the
# tokenizer's padding rules when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Define the evaluation metrics (accuracy, precision, recall, false positive rate) using scikit-learn

In [17]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix  # <-- Add this line

def compute_metrics(eval_pred):
    """Compute binary-classification metrics for the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall`` and
        ``false_positive_rate`` as plain floats. Precision and recall fall
        back to 0 when undefined (``zero_division=0``); the false positive
        rate is 0.0 when there are no negative examples.
    """
    logits, y_true = eval_pred
    y_pred = logits.argmax(axis=-1)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)

    # Pin the label set so the matrix is always 2x2. Without it, a batch in
    # which only one class occurs yields a 1x1 matrix and the 4-way unpack
    # below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = fp + tn
    fpr = fp / negatives if negatives > 0 else 0.0

    return {
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "false_positive_rate": float(fpr),
    }


Set up training arguments

In [13]:
training_args = TrainingArguments(
    # Output location passed to the Trainer.
    output_dir="./results",
    # Schedule: three passes over the training data, evaluating after each one.
    num_train_epochs=3,
    eval_strategy="epoch",
    # Batch sizes (per device).
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimisation hyper-parameters.
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    # Data: the held-out `test` split doubles as the evaluation set, so the
    # metrics reported during training are measured on it.
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator,
    # NOTE(review): newer transformers releases deprecate `tokenizer=` in favour
    # of `processing_class=` -- confirm against the installed version before changing.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [18]:
# Run fine-tuning. With eval_strategy="epoch", compute_metrics is evaluated on the
# eval dataset after every epoch; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,False Positive Rate
1,0.0865,0.218693,0.962796,0.98173,0.928866,0.012557
2,0.0542,0.147784,0.968975,0.978808,0.946765,0.014891
3,0.0269,0.179279,0.972709,0.97572,0.959003,0.017335


TrainOutput(global_step=11652, training_loss=0.05705107907597404, metrics={'train_runtime': 4704.0146, 'train_samples_per_second': 39.631, 'train_steps_per_second': 2.477, 'total_flos': 5.066571726434304e+16, 'train_loss': 0.05705107907597404, 'epoch': 3.0})