# Fine-tuning script

*Anusha, Joyce, Nina*

#### Imports

In [2]:
import csv

import numpy as np
import pandas as pd
import torch
from Bio import SeqIO
from datasets import load_dataset
from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from transformers.models.bert.configuration_bert import BertConfig
#import evaluate

In [11]:
# Active NetID: keep exactly one of these assignments uncommented.
princeton_id = 'aa8417'
#princeton_id = 'ns...'
#princeton_id = 'jf...'

# Name of this fine-tuning run; it becomes the sub-directory for model outputs.
model_name = 'fine_tune_v1'

# Project root on scratch, and the directory this run writes its model files to.
project_dir = f'/scratch/gpfs/{princeton_id}/QCB557_project'
model_out_dir = f'{project_dir}/models/{model_name}'

#### Load model and tokenizer from Hugging Face

In [4]:
# Use the first GPU when one is visible; otherwise fall back to the CPU so the
# cell still runs on a machine without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Binary classification: set the label count explicitly rather than only
# reading the attribute (a bare `config.num_labels` statement has no effect).
config = BertConfig.from_pretrained("zhihan1996/DNABERT-2-117M")
config.num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(
    "zhihan1996/DNABERT-2-117M", trust_remote_code=True, config=config
)
model.to(device)

# The tokenizer is lightweight and stays on the CPU. Padding is chosen per call
# where sequences are tokenized, so none is requested here.
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True, padding=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['bert.pooler.dense.weight', 'classifier.bias', 'bert.pooler.dense.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Print the model's module hierarchy to inspect the loaded architecture.
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(4096, 768, padding_idx=0)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertUnpadAttention(
            (self): BertUnpadSelfAttention(
              (dropout): Dropout(p=0.0, inplace=False)
              (Wqkv): Linear(in_features=768, out_features=2304, bias=True)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (mlp): BertGatedLinearUnitMLP(
            (gated_layers): Linear(in_feat

#### Freeze gradients

We can play around with this later as we improve the performance of the fine-tuned model.

In [5]:
# By default every parameter loaded by AutoModelForSequenceClassification is
# trainable. Here we restrict training to:
#   * the last encoder layer (encoder.layer.11), and
#   * the pooler and classifier weights, which the load warning reports as
#     newly initialised and which therefore must be trained.
# As we go through rounds of fine-tuning we can optionally unfreeze more of the
# pretrained layers by adding their name fragments to this tuple.
trainable_markers = ("encoder.layer.11.", "pooler.", "classifier.")

for name, param in model.named_parameters():
    # A parameter is trainable only if its qualified name contains one of the markers.
    param.requires_grad = any(marker in name for marker in trainable_markers)

In [42]:
# NOTE(review): the saved output below shows a bare BertModel and an execution
# count of 42, so it looks stale relative to the cells above — re-run to refresh.
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(4096, 768, padding_idx=0)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertUnpadAttention(
          (self): BertUnpadSelfAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (Wqkv): Linear(in_features=768, out_features=2304, bias=True)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (mlp): BertGatedLinearUnitMLP(
          (gated_layers): Linear(in_features=768, out_features=6144, bias=False)
          (act): GELU(approximate='none')
  

#### Define the dataset class (tokenizes sequences loaded from CSV)

In [7]:
class custom_data_load(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one sequence per item.

    Args:
        dataframe: pandas DataFrame with a 'sequence' column and a 'label' column.
        tokenizer: callable invoked as
            ``tokenizer(sequence, padding='max_length', max_length=..., truncation=True, return_tensors='pt')``
            and expected to return a mapping with 'input_ids' and 'attention_mask'.
        shuffle: if True, rows are shuffled once at construction time.
        max_length: length every sequence is padded or truncated to (default 500).
        seed: optional random state for the shuffle; None keeps it non-deterministic.
    """

    def __init__(self, dataframe, tokenizer, shuffle=True, max_length=500, seed=None):
        if shuffle:
            # Shuffle a copy of the rows and renumber them from 0.
            self.dataframe = dataframe.sample(frac=1, random_state=seed).reset_index(drop=True)
        else:
            self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (examples)."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return input_ids, attention_mask and labels tensors."""
        row = self.dataframe.iloc[idx]

        # The tokenizer generates the attention mask alongside the input ids.
        inputs = self.tokenizer(
            row['sequence'],
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt',
        )

        # Tensors deliberately stay on the CPU: CUDA cannot be initialised inside
        # forked DataLoader workers, and Trainer moves each batch to the training
        # device itself.
        return {
            # squeeze(0) drops only the batch axis added by return_tensors='pt'.
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': torch.tensor(row['label'], dtype=torch.long),
        }

#### load in train and test data

In [8]:
# Read the training split from the project data directory and wrap it in the
# tokenizing dataset (rows are shuffled by default).
train_data = pd.read_csv(f'{project_dir}/data/train.csv')
train_ds = custom_data_load(train_data, tokenizer)

In [9]:
# Read the held-out split from the project data directory and wrap it the same
# way as the training split.
test_data = pd.read_csv(f'{project_dir}/data/test.csv')
test_ds = custom_data_load(test_data, tokenizer)

In [10]:
# Sanity check: look at the first tokenized training example.
train_ds[0]

{'input_ids': tensor([   1,   72, 2735,  136,   53, 3354, 1966,   52,  253,   72,  446, 2484,
         3638,  114,   71,  347,   83, 1359,  132,  653,  303,  241,   37, 3838,
          145,   46,   79,  473,  146,  712,  841,  212,  610,  284,   77,   16,
          269,   43, 1390,   50,   65,  103, 1334,   52,  387,  429,  101,  260,
           23,  140,   72,   42, 1175,  245,  303,   50, 1534,   15,  168,  240,
          207,  886,  245,   42,   41,  649,   33,  126,  213,  110, 2215,  191,
         1778,   49,  200, 1007,   79,  138,   71,  135,  495,  997,   32,  189,
           57, 2284,  430,   83, 3697,  291, 1875,   28,  147,   20, 1388,   65,
           46,  133,   63, 1102,  831,  162,   20,    8,    2,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3, 

#### training arguments

In [None]:
# TODO: find optimal hyperparameters

# Training logs are written to a sub-directory of this run's output directory.
logging_dir = f'{model_out_dir}/logs'

training_args = TrainingArguments(
    # --- where things are written ---
    output_dir=model_out_dir,
    overwrite_output_dir=True,
    logging_dir=logging_dir,

    # --- batching / throughput ---
    per_device_train_batch_size=16,  # powers of 2
    gradient_accumulation_steps=2,   # effective batch size of 32 per device
    # NOTE(review): confirm the remote-code DNABERT-2 model supports gradient checkpointing.
    gradient_checkpointing=True,
    dataloader_num_workers=4,
    # Mixed precision needs a CUDA device; disable it automatically on CPU.
    fp16=torch.cuda.is_available(),

    # --- optimisation ---
    weight_decay=0.015,
    num_train_epochs=2,

    # --- evaluation, checkpointing and logging cadence ---
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy`.
    evaluation_strategy="steps",
    eval_steps=800,
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=800,
    log_level="error",
)

#### metrics to compute

In [None]:
def compute_metrics(eval_pred):
    """Compute binary-classification metrics for the Trainer evaluation loop.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` is an array of shape
            (n_examples, n_classes), or a tuple whose first element is that
            array; ``labels`` is an array of integer class ids.

    Returns:
        dict with float values for 'accuracy', 'precision', 'recall' and 'f1',
        treating class 1 as the positive class. Ratios with a zero denominator
        are reported as 0.0.
    """
    logits, labels = eval_pred
    # Some models return extra outputs next to the logits; keep only the logits.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]

    predictions = np.argmax(np.asarray(logits), axis=-1).reshape(-1)
    labels = np.asarray(labels).reshape(-1)

    true_pos = int(np.sum((predictions == 1) & (labels == 1)))
    false_pos = int(np.sum((predictions == 1) & (labels != 1)))
    false_neg = int(np.sum((predictions != 1) & (labels == 1)))

    def _ratio(numerator, denominator):
        # Guard against division by zero when a class is never predicted/present.
        return float(numerator) / denominator if denominator else 0.0

    precision = _ratio(true_pos, true_pos + false_pos)
    recall = _ratio(true_pos, true_pos + false_neg)
    return {
        'accuracy': _ratio(int(np.sum(predictions == labels)), labels.size),
        'precision': precision,
        'recall': recall,
        'f1': _ratio(2 * precision * recall, precision + recall),
    }

#### Trainer

In [None]:
# Each example carries one sequence-level label, so only the tokenized inputs
# need padding into a batch; a token-classification collator (which pads a
# per-token label sequence) does not fit this task.
data_collator = DataCollatorWithPadding(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

trainer.train()