# Finetuning ProkBERT using Adaptive curricular loss 
The loss (CurricularFace) is described in https://arxiv.org/abs/2004.00288

# Loading the data

For testing purposes we prepared a small dataset based on 10 genomes from the GTDB database. The dataset is available on the Hugging Face Hub as `neuralbioinfo/GTDB_bacterial_small_tokenized`.

In [5]:
# Local directory the tokenized GTDB sample dataset is downloaded into and later read from.
dataset_path = "./data/GTDB_bacterial_small_tokenized"

In [None]:
from huggingface_hub import snapshot_download

# Fetch the tokenized GTDB sample dataset repository from the Hugging Face Hub
# into `dataset_path`. The call stays the last expression of the cell so the
# notebook displays its return value.
hub_dataset_spec = {
    "repo_id": "neuralbioinfo/GTDB_bacterial_small_tokenized",
    "repo_type": "dataset",
    "local_dir": dataset_path,
}
snapshot_download(**hub_dataset_spec)

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00,  7.83it/s]


'/leonardo_work/EUHPC_R04_194/prokbert/examples/data/GTDB_bacterial_small_tokenized'

The sequences in this dataset have already been segmented and tokenized. These steps are described in detail in the Segmentation and Tokenization notebooks.

In [12]:
import os
from datasets import load_dataset


def _read_parquet_split(file_pattern):
    """Load all parquet shards matching ``file_pattern`` under ``dataset_path/data``.

    The shards are passed as a bare ``data_files`` pattern, so the result is
    requested through the ``"train"`` split regardless of which logical split
    (train / validation / test) the files belong to.
    """
    shard_glob = os.path.join(dataset_path, 'data', file_pattern)
    return load_dataset("parquet", data_files=shard_glob, split="train")


dataset_training = _read_parquet_split('train-*.parquet')
eval_dataset = _read_parquet_split('validation-*.parquet')
test_dataset = _read_parquet_split('test-*.parquet')

# Quick sanity check of how many segments each split contains.
print(f"Training dataset size: {len(dataset_training)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Generating train split: 719 examples [00:00, 1967.28 examples/s]
Generating train split: 6514 examples [00:03, 2086.29 examples/s]

Training dataset size: 6123
Evaluation dataset size: 719
Test dataset size: 6514





In [13]:
# Display the training split's summary (feature names and number of rows).
dataset_training

Dataset({
    features: ['sequence_id', 'segment_start', 'input_ids', 'attention_mask', 'labels', 'token_length', 'unknown_token_count', 'full_labels'],
    num_rows: 6123
})

In [18]:
# Count the distinct values of the `labels` column in the training split;
# the result is passed to the model as `curricular_num_labels`.
distinct_labels = dataset_training.unique('labels')
num_of_classes = len(distinct_labels)
print(f"Number of unique labels in training dataset: {num_of_classes}")

Number of unique labels in training dataset: 10


# Loading the model
For fine-tuning we will use a model built on top of the pretrained `prokbert-mini-long` model.

In [19]:
from prokbert.models import *
import torch

prokbert_base_model = "neuralbioinfo/prokbert-mini-long"

# Settings forwarded to the curricular classification model.
# NOTE(review): `curricular_face_m` / `curricular_face_s` are presumably the
# margin and scale of the CurricularFace loss -- confirm against prokbert.models.
curricular_head_settings = dict(
    curricular_num_labels=num_of_classes,
    curricular_face_m=0.5,
    curricular_face_s=64.0,
    classification_dropout_rate=0.1,
    curriculum_hidden_size=128,
)

# Load the pretrained checkpoint in bfloat16; the checkpoint name is also
# handed over as `bert_base_model`.
model = ProkBertForCurricularClassification.from_pretrained(
    prokbert_base_model,
    bert_base_model=prokbert_base_model,
    torch_dtype=torch.bfloat16,
    **curricular_head_settings,
)

# Prefer the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

print(model)

Some weights of ProkBertForCurricularClassification were not initialized from the model checkpoint at neuralbioinfo/prokbert-mini-long and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'curricular_face.kernel', 'curricular_face.t', 'linear.bias', 'linear.weight', 'weighting_layer.bias', 'weighting_layer.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ProkBertForCurricularClassification(
  (bert): ProkBertModel(
    (embeddings): MegatronBertEmbeddings(
      (word_embeddings): Embedding(4200, 384, padding_idx=0)
      (position_embeddings): Embedding(2048, 384)
      (token_type_embeddings): Embedding(2, 384)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): MegatronBertEncoder(
      (layer): ModuleList(
        (0-5): 6 x MegatronBertLayer(
          (attention): MegatronBertAttention(
            (ln): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (self): MegatronBertSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
              (distance_embedding): Embedding(4095, 64)
            )
            (output): MegatronBertSelfOutput(
              (dense

In [20]:
# Report the model size in millions of parameters.
params_in_millions = model.num_parameters() / 1e6
print(f"number of model parameters: {params_in_millions} million")

number of model parameters: 26.630657 million


As you can see, the model structure is almost the same as that of the ProkBERT model, but with the addition of two linear layers.

# 3) Training set up

Set up the main parameters for fine-tuning: the optimizer, the scheduler, and the training arguments.

In [21]:
# `transformers.AdamW` is deprecated; use the PyTorch implementation instead.
from torch.optim import AdamW

backbone_lr_rate = 0.0000160
head_lr_rate = 0.00048
beta_1 = 0.5794
beta_2 = 0.6576

# Split the parameters into the pretrained encoder and the newly added head so
# that each group gets its own learning rate. The encoder is registered on the
# model under the attribute `bert`, so its parameter names start with "bert.".
# (The previous filter looked for the substring "model", which matches no
# parameter name and therefore put every parameter into the head group.)
backbone_prefix = "bert."
bert_params = [p for n, p in model.named_parameters() if n.startswith(backbone_prefix)]
head_params = [p for n, p in model.named_parameters() if not n.startswith(backbone_prefix)]

# Fail loudly if the split silently degenerates (e.g. after a model refactoring).
if not bert_params or not head_params:
    raise ValueError(
        f"Parameter split by prefix {backbone_prefix!r} produced an empty group: "
        f"{len(bert_params)} backbone / {len(head_params)} head parameters"
    )

optimizer = AdamW(
    [
        {"params": bert_params, "lr": backbone_lr_rate},
        {"params": head_params, "lr": head_lr_rate},
    ],
    betas=(beta_1, beta_2),
    # Keep the defaults of the previously used `transformers.AdamW`
    # (torch defaults would be eps=1e-8 and weight_decay=0.01).
    eps=1e-6,
    weight_decay=0.0,
)



In [22]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Halve the learning rate of every parameter group when the monitored value
# has not improved for `patience` consecutive scheduler steps.
# NOTE(review): when this scheduler is handed to the Hugging Face Trainer, the
# Trainer steps it after each evaluation with the metric named by
# `metric_for_best_model`; `mode` must match that metric's direction
# ("min" for a loss, "max" for a score).
# The `verbose` argument is deprecated in PyTorch and is no longer passed;
# inspect `lr_scheduler.get_last_lr()` to see the current learning rates.
lr_scheduler = ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.5,
    patience=5,
)

In [24]:
from transformers import TrainingArguments

output_dir = "./test_prokbert"
batch_size = 16

# set up number of steps
num_epoches = 1
num_warmup = 0
count = len(dataset_training)
# NOTE(review): the factor 4 is presumably the number of devices the per-device
# batch is replicated on -- adjust it when running on a different device count.
max_steps = int((count // (batch_size * 4)) * num_epoches)
# Evaluate (and checkpoint) twice per run; never let the interval collapse to 0
# for very small datasets.
eval_step = max(1, max_steps // 2)


training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=False,
    report_to="tensorboard",
    eval_strategy="steps",
    logging_strategy="steps",
    max_steps=max_steps,
    eval_steps=eval_step,
    logging_steps=10,
    # Checkpoints are written on the same schedule as evaluations, which
    # `load_best_model_at_end` relies on.
    save_steps=eval_step,
    save_total_limit=1,
    load_best_model_at_end=True,
    # No `compute_metrics` function is supplied to the Trainer in this
    # notebook, so the evaluation loss is the only model-quality metric that is
    # reported. Selecting a metric that is never computed (previously
    # "eval_silhouette_score") makes evaluation fail with a KeyError.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # NOTE(review): `weight_decay` only affects an optimizer created by the
    # Trainer itself; it is presumably ignored when a ready-made optimizer is
    # passed to the Trainer -- confirm and set it on the optimizer if needed.
    weight_decay=0.00002737,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=1,
    dataloader_num_workers=4,
    dataloader_prefetch_factor=1,
    torch_compile=False,
    bf16=True,
    max_grad_norm=1.0,
    ddp_find_unused_parameters=True,
)

In [27]:
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    Trainer,
)

# The tokenizer of the base checkpoint is used by the padding collator and is
# also handed to the Trainer as its processing class.
tokenizer = AutoTokenizer.from_pretrained(prokbert_base_model, trust_remote_code=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Early stopping with a patience of 10.
early_stopping = EarlyStoppingCallback(early_stopping_patience=10)

# The optimizer and LR scheduler built earlier are passed in explicitly via
# `optimizers`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_training,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    processing_class=tokenizer,
    optimizers=(optimizer, lr_scheduler),
    callbacks=[early_stopping],
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [28]:
# Start fine-tuning with the trainer configured in the previous cell.
trainer.train()

  ctx_manager = torch.cpu.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


: 