# Fine-tuning script

*Anusha, Joyce, Nina*

#### Imports

In [22]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from transformers.models.bert.configuration_bert import BertConfig
import numpy as np
from datasets import load_dataset

import csv
from Bio import SeqIO
import evaluate

In [23]:
project_dir = '/scratch/gpfs/ns5404/QCB557_project'

model_out_dir

#### Load model and tokenizer from Hugging Face

In [61]:
# use gpu
#device = 'cuda' if torch.cuda.is_available() else 'cpu'
#model.to(device)

device = 'cuda:0'

config = BertConfig.from_pretrained("zhihan1996/DNABERT-2-117M")
config.num_labels #2 labels
model = AutoModel.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True, config=config).to(device)

# don't need to move the tokenizer to gpu b/c it's light
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
tokenizer.pad_token = "X"

Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2

#### Freeze gradients

In [25]:
# unfreeze the last layer in the encoder block
for name, param in model.named_parameters():
    if "encoder.layer.11" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

In [None]:
#change classification head of the model
#reference: https://discuss.huggingface.co/t/how-do-i-change-the-classification-head-of-a-model/4720/3
class BinaryDNABERT2Model(nn.Module):
    def __init__(self):
        super(BinaryDNABERT2Model, self).__init__()

        self.base_model = AutoModel.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True, config=config).to(device)
        self.dropout = nn.Dropout(0.5)
        self.linear = nn.Linear(768, 2) # output features from bert is 768 and 2 is number of labels
        
    def forward(self, input_ids, attn_mask):
        outputs = self.base_model(input_ids, attention_mask=attn_mask)
        # You write you new head here
        outputs = self.dropout(outputs[0])
        outputs = self.linear(outputs)
        
        return outputs

In [42]:
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(4096, 768, padding_idx=0)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertUnpadAttention(
          (self): BertUnpadSelfAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (Wqkv): Linear(in_features=768, out_features=2304, bias=True)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (mlp): BertGatedLinearUnitMLP(
          (gated_layers): Linear(in_features=768, out_features=6144, bias=False)
          (act): GELU(approximate='none')
  

#### Load data from Hugging Face

In [50]:
class custom_data_load(torch.utils.data.Dataset):
    def __init__(self, dataframe, tokenizer, shuffle=True):
        if shuffle:
            self.dataframe = dataframe.sample(frac=1).reset_index(drop=True)  # shuffle the dataframe
        else:
            self.dataframe = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        sequence = self.dataframe.iloc[idx]['sequence']
        label = self.dataframe.iloc[idx]['label']

        # tokenize the sequence
        # tokenizer automatically generates attention masks
        #DNABERT-2 max length = 128?
        inputs = self.tokenizer(sequence, padding='max_length', max_length=128, truncation=True, return_tensors='pt')
        
        # move inputs to gpu
        inputs = {key: value.to(device) for key, value in inputs.items()}

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [51]:
def fasta_to_csv(fasta_file, csv_file):
    with open(fasta_file, 'r') as fasta_fh, open(csv_file, 'w', newline='') as csv_fh:
        writer = csv.writer(csv_fh)
        writer.writerow(['sequence', 'label'])
        for record in SeqIO.parse(fasta_fh, 'fasta'):
            label = record.id.split('|')[-1]
            sequence = str(record.seq)
            writer.writerow([sequence, label])

In [52]:
fasta_file = f'{project_dir}/data/H3K4me3/train.fna'
csv_file =f'{project_dir}/data/train.csv'
fasta_to_csv(fasta_file, csv_file)

In [53]:
fasta_file = f'{project_dir}/data/H3K4me3/test.fna'
csv_file = f'{project_dir}/data/test.csv'
fasta_to_csv(fasta_file, csv_file)

#### load in train and test data

In [54]:
train_data = pd.read_csv(f'{project_dir}/data/train.csv')
train_ds = custom_data_load(dataframe = train_data, tokenizer = tokenizer)

In [58]:
test_data = pd.read_csv(f'{project_dir}/data/test.csv')
test_ds = custom_data_load(dataframe = test_data, tokenizer = tokenizer)

In [57]:
train_ds.__getitem__(0)

{'input_ids': tensor([   1,   41, 1019,   25, 1518,   14,  409, 1132,   42,   37,  444,   29,
          119,  129,  183,  391,  504,  147,   23,  127,   43,  265,  121,  194,
           33,  952,   33,  700,  202,  104,  159,   44,   94,   43,  140, 1641,
          340,  930, 2081,  299, 3896,  860,  262,  116,  364,  114,  384,   47,
          267, 1676,   70,  170,   45,  331,  114,  102,   29,  196,   72, 1655,
          660,   22,  105,   36, 1460,   35,  172,  712, 1500,  358, 3780,  590,
          495,  315,   36, 1492, 3527, 2139,  435,   38,   16,  250,   55,  703,
          443,   21,  674,   35,  732, 3754,   43,  232,  252,   36,  353, 2559,
          273,  318, 3537,  124,   60, 1760,    7,    2,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0], device='cuda:0'),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  

#### training arguments

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
        per_device_train_batch_size=16, #powers of 2
        gradient_accumulation_steps=2,
        gradient_checkpointing=True,
        dataloader_num_workers=4,
        
        fp16=True, 
        weight_decay=0.015, 
        num_train_epochs=2, 
        log_level="error",
        output_dir=model_out_dir,
        evaluation_strategy="steps",  
        eval_steps=800, 
         save_strategy="epoch",        
         logging_steps=800,
         logging_strategy="steps",
         logging_dir=logging_dir,
         overwrite_output_dir = True,
    )

#### metrics to compute

In [None]:
#or can define custom metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
    

#### Trainer

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

trainer.train()