<a href="https://colab.research.google.com/github/perfect7613/modernbert-finetuning/blob/main/modernbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets evaluate scikit-learn

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3

In [2]:
from datasets import load_dataset

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

import evaluate
import numpy as np
from transformers import DataCollatorWithPadding

### Load data

In [3]:
# Load the phishing-site classification dataset; the result is a DatasetDict
# keyed by split name (inspected in the next cell).
dataset_dict = load_dataset("shawhin/phishing-site-classification")

README.md:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/98.0k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/21.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/450 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/450 [00:00<?, ? examples/s]

In [4]:
# Display the available splits with their feature names and row counts.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 450
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 450
    })
})

In [5]:
# Checkpoint that both the tokenizer and the classifier are loaded from
model_path = "answerdotai/ModernBERT-base"

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Class index <-> human-readable label; the reverse map is derived from the
# forward one so the two can never drift apart.
id2label = {0: "Safe", 1: "Not Safe"}
label2id = {label: idx for idx, label in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Freeze base model

In [6]:
# List every parameter of the classifier with its current requires_grad flag
for name, param in model.named_parameters():
    print(f"{name} {param.requires_grad}")

model.embeddings.tok_embeddings.weight True
model.embeddings.norm.weight True
model.layers.0.attn.Wqkv.weight True
model.layers.0.attn.Wo.weight True
model.layers.0.mlp_norm.weight True
model.layers.0.mlp.Wi.weight True
model.layers.0.mlp.Wo.weight True
model.layers.1.attn_norm.weight True
model.layers.1.attn.Wqkv.weight True
model.layers.1.attn.Wo.weight True
model.layers.1.mlp_norm.weight True
model.layers.1.mlp.Wi.weight True
model.layers.1.mlp.Wo.weight True
model.layers.2.attn_norm.weight True
model.layers.2.attn.Wqkv.weight True
model.layers.2.attn.Wo.weight True
model.layers.2.mlp_norm.weight True
model.layers.2.mlp.Wi.weight True
model.layers.2.mlp.Wo.weight True
model.layers.3.attn_norm.weight True
model.layers.3.attn.Wqkv.weight True
model.layers.3.attn.Wo.weight True
model.layers.3.mlp_norm.weight True
model.layers.3.mlp.Wi.weight True
model.layers.3.mlp.Wo.weight True
model.layers.4.attn_norm.weight True
model.layers.4.attn.Wqkv.weight True
model.layers.4.attn.Wo.weight Tru

In [7]:
# Single pass over the base model: a parameter stays trainable only when its
# name contains "pooler"; every other base-model parameter is frozen.
for name, param in model.base_model.named_parameters():
    param.requires_grad = "pooler" in name

In [8]:
# Re-list all parameters to confirm which ones are still trainable after freezing
for name, param in model.named_parameters():
    print(f"{name} {param.requires_grad}")

model.embeddings.tok_embeddings.weight False
model.embeddings.norm.weight False
model.layers.0.attn.Wqkv.weight False
model.layers.0.attn.Wo.weight False
model.layers.0.mlp_norm.weight False
model.layers.0.mlp.Wi.weight False
model.layers.0.mlp.Wo.weight False
model.layers.1.attn_norm.weight False
model.layers.1.attn.Wqkv.weight False
model.layers.1.attn.Wo.weight False
model.layers.1.mlp_norm.weight False
model.layers.1.mlp.Wi.weight False
model.layers.1.mlp.Wo.weight False
model.layers.2.attn_norm.weight False
model.layers.2.attn.Wqkv.weight False
model.layers.2.attn.Wo.weight False
model.layers.2.mlp_norm.weight False
model.layers.2.mlp.Wi.weight False
model.layers.2.mlp.Wo.weight False
model.layers.3.attn_norm.weight False
model.layers.3.attn.Wqkv.weight False
model.layers.3.attn.Wo.weight False
model.layers.3.mlp_norm.weight False
model.layers.3.mlp.Wi.weight False
model.layers.3.mlp.Wo.weight False
model.layers.4.attn_norm.weight False
model.layers.4.attn.Wqkv.weight False
model.

#### Preprocess text

In [9]:
# Tokenization step applied to each batch of examples
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with truncation enabled.

    Args:
        examples: Mapping that provides the raw inputs under the ``"text"`` key.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those inputs.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [10]:
# Tokenize all dataset splits; batched=True hands preprocess_function
# batches of examples rather than one example at a time.
tokenized_data = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/2100 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

In [11]:
# Create the data collator that pads each batch at collation time.
# preprocess_function only truncates, so no padding has been applied before this point.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#### Evaluation

In [12]:
# Fetch the two metric objects that compute_metrics relies on
accuracy, auc_score = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "roc_auc")
)

def compute_metrics(eval_pred):
    """Compute accuracy and ROC AUC from raw model outputs.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is indexed as
            a 2-D array of per-class scores with at least two columns; column 1
            is treated as the positive class.

    Returns:
        dict: ``{"Accuracy": ..., "AUC": ...}``, each rounded to 3 decimals.
    """
    predictions, labels = eval_pred
    scores = np.asarray(predictions)

    # Softmax, shifted by the per-row maximum before exponentiating. The shift
    # leaves the result mathematically unchanged but keeps np.exp from
    # overflowing to inf (and the division from producing NaN) on large logits.
    shifted = scores - scores.max(axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    probabilities = exp_scores / exp_scores.sum(-1, keepdims=True)

    # ROC AUC is computed from the positive-class probability only
    positive_class_probs = probabilities[:, 1]
    auc = np.round(
        auc_score.compute(prediction_scores=positive_class_probs, references=labels)['roc_auc'],
        3,
    )

    # Accuracy uses the highest-scoring class per row
    predicted_classes = np.argmax(scores, axis=1)
    acc = np.round(
        accuracy.compute(predictions=predicted_classes, references=labels)['accuracy'],
        3,
    )

    return {"Accuracy": acc, "AUC": auc}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

#### Train model

In [13]:
# Tunable hyperparameters: learning rate, per-device batch size, epoch count
lr, batch_size, num_epochs = 2e-4, 8, 15

training_args = TrainingArguments(
    output_dir="modernbert-phishing-classifier",
    num_train_epochs=num_epochs,
    learning_rate=lr,
    # same batch size for training and evaluation
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # log, evaluate and checkpoint on the same once-per-epoch cadence
    **dict.fromkeys(("logging_strategy", "eval_strategy", "save_strategy"), "epoch"),
    load_best_model_at_end=True,
)

In [14]:
# Wire the model, tokenized splits, collator and metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    # NOTE(review): the "test" split is the one evaluated during training, while
    # the "validation" split is scored separately after training — confirm this
    # split assignment is intentional.
    eval_dataset=tokenized_data["test"],
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class=` is the
    # replacement argument for passing the tokenizer.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mameymuke252003[0m ([33mameymuke252003-n-a[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


W0317 16:29:01.314000 1316 torch/_inductor/utils.py:1137] [1/0] Not enough SMs to use max_autotune_gemm mode


Epoch,Training Loss,Validation Loss,Accuracy,Auc
1,0.3714,0.293642,0.869,0.949
2,0.2622,0.268146,0.884,0.96
3,0.2405,0.26425,0.898,0.961
4,0.2091,0.268754,0.893,0.963
5,0.2078,0.381334,0.882,0.962
6,0.1887,0.266715,0.9,0.965
7,0.1695,0.285132,0.902,0.964
8,0.1654,0.293506,0.902,0.964
9,0.157,0.316902,0.904,0.966
10,0.158,0.319016,0.896,0.964


TrainOutput(global_step=3945, training_loss=0.1880220314093567, metrics={'train_runtime': 348.1302, 'train_samples_per_second': 90.483, 'train_steps_per_second': 11.332, 'total_flos': 1381898187616464.0, 'train_loss': 0.1880220314093567, 'epoch': 15.0})

### Apply Model to Validation Dataset

In [15]:
# Run the trained model over the validation split
predictions = trainer.predict(tokenized_data["validation"])

# Pull the raw model outputs and the reference labels out of the prediction result
logits, labels = predictions.predictions, predictions.label_ids

# Score them with the same metric function used during training
metrics = compute_metrics((logits, labels))
print(metrics)

{'Accuracy': 0.878, 'AUC': 0.955}


### Push to hub

In [16]:
# Push the model held by the trainer to the Hugging Face Hub.
# The returned commit info is displayed as the cell result.
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/598M [00:00<?, ?B/s]

events.out.tfevents.1742228917.43e3db9356a5.1316.0:   0%|          | 0.00/14.9k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/Perfect7613/modernbert-phishing-classifier/commit/377d4f9330a47b8f3d33ddaeb8d0e13d7474b9f6', commit_message='End of training', commit_description='', oid='377d4f9330a47b8f3d33ddaeb8d0e13d7474b9f6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Perfect7613/modernbert-phishing-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='Perfect7613/modernbert-phishing-classifier'), pr_revision=None, pr_num=None)