# Fine-tuning script

*Anusha, Joyce, Nina*

#### Imports

In [1]:
import pandas as pd
import torch
from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AutoModelForSequenceClassification, 
    DataCollatorForTokenClassification, 
    DataCollatorWithPadding,
    TrainingArguments, 
    Trainer, 
    DataCollatorForLanguageModeling
)
from transformers.models.bert.configuration_bert import BertConfig 
import numpy as np
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import transformers
#import Dataset
import os

import csv
from Bio import SeqIO

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

import matplotlib.pyplot as plt

#os.environ['WANDB_NOTEBOOK_NAME'] = 'fine-tune'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Per-user configuration: keep exactly one `princeton_id` assignment active
# (add your own id and comment out the others when running).
princeton_id = 'ns5404'
#princeton_id = 'ns5404'
#princeton_id = 'jf...'

# Project root directory; every data/model/output path below is built from it.
project_dir = f'/scratch/gpfs/{princeton_id}/QCB557_project'

# Name of this fine-tuning run. It selects the model output directory here and
# is reused later in the notebook to name the log and results CSV files.
model_name = 'fine_tune_new_v8'
model_out_dir = f'{project_dir}/models/{model_name}'

#### Load model and tokenizer from Hugging Face

In [3]:
# Prefer the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
print(device)

# Build the sequence-classification model from the pretrained checkpoint,
# passing in the config loaded for the same checkpoint.
config = BertConfig.from_pretrained("zhihan1996/DNABERT-2-117M")
print(config.num_labels)  # 2 labels
model = AutoModelForSequenceClassification.from_pretrained(
    "zhihan1996/DNABERT-2-117M",
    config=config,
    trust_remote_code=True,
)

# The tokenizer is lightweight and is not moved to the GPU.
# NOTE(review): `padding=True` is passed at load time and the pad token is then
# overridden with "X"; confirm both are intended -- padding is requested again
# explicitly wherever the tokenizer is called in this notebook.
tokenizer = AutoTokenizer.from_pretrained(
    "zhihan1996/DNABERT-2-117M",
    padding=True,
    trust_remote_code=True,
)
tokenizer.pad_token = "X"

cuda:0
2


Some weights of the model checkpoint at zhihan1996/DNABERT-2-117M were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly ini

In [None]:
# Print the module tree of the loaded model to inspect its layer names.
print(model)

print(model)

#### Freeze and unfreeze gradients

We can experiment with which layers are frozen later, as we improve the performance of the fine-tuned model.

In [None]:
# Turn off gradient tracking on every weight so that nothing is trainable
for weight in model.parameters():
    weight.requires_grad_(False)

In [None]:
# Parameters loaded with `from_pretrained` are trainable unless they are frozen
# explicitly, so this cell sets `requires_grad` on EVERY parameter:
#   * the classification head and the last encoder layer stay trainable,
#   * everything else is frozen.
# As we go through rounds of fine-tuning, more of the pretrained layers can be
# added to `trainable_markers` to improve performance.

# Derive the index of the last encoder layer from the model config instead of
# hard-coding it. The trailing "." keeps e.g. "layer.1" from matching "layer.11".
last_layer_idx = model.config.num_hidden_layers - 1
trainable_markers = ("classifier", f"encoder.layer.{last_layer_idx}.")

for name, param in model.named_parameters():
    param.requires_grad = any(marker in name for marker in trainable_markers)

In [None]:
# List which parameters are trainable and which are frozen, to check that the
# freeze/unfreeze cells had the intended effect.
trainable_names, frozen_names = [], []
for param_name, parameter in model.named_parameters():
    bucket = trainable_names if parameter.requires_grad else frozen_names
    bucket.append(param_name)

print("Parameters with gradient enabled:")
for param_name in trainable_names:
    print(param_name)

print("\nParameters with gradient disabled:")
for param_name in frozen_names:
    print(param_name)


In [None]:
# Move the model to the device chosen when the model was loaded (GPU if available).
model.to(device)

#### Dataset class (tokenizes each sequence on access)

In [4]:
class custom_data_load(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one row of a dataframe per access.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide a 'sequence' column (text passed to the tokenizer) and a
        'label' column (integer class id).
    tokenizer : callable
        Called as ``tokenizer(sequence, padding=..., max_length=...,
        truncation=..., return_tensors='pt')``; must return a mapping with
        'input_ids' and 'attention_mask' tensors that have a leading batch
        dimension of size 1.
    shuffle : bool, default True
        If True, the rows are shuffled once at construction time and the index
        is reset. If False, the dataframe is used as given.
    max_length : int, default 128
        Every example is padded AND truncated to this many tokens.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` so the shuffle can be made
        reproducible. ``None`` keeps the previous (unseeded) behaviour.
    """

    def __init__(self, dataframe, tokenizer, shuffle=True, max_length=128, random_state=None):
        if shuffle:
            # shuffle the rows once, up front
            self.dataframe = dataframe.sample(frac=1, random_state=random_state).reset_index(drop=True)
        else:
            self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (rows)."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return input_ids, attention_mask and labels."""
        row = self.dataframe.iloc[idx]
        sequence = row['sequence']
        label = row['label']

        # The tokenizer also generates the attention mask.
        # truncation=True is required together with padding='max_length':
        # without it a sequence longer than `max_length` tokens yields a longer
        # tensor, and examples of different lengths cannot be stacked in a batch.
        inputs = self.tokenizer(
            sequence,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt',
        )

        return {
            # squeeze(0) removes only the batch dimension added by return_tensors='pt'
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': torch.tensor([label], dtype=torch.long),
        }

#### Load train and test data

In [None]:
# Read the labelled training table and cast its label column to integers
train_data_to_split = pd.read_csv(f'{project_dir}/data/train.csv').astype({'label': int})

In [5]:
# Read the held-out test table and cast its label column to integers
test_data = pd.read_csv(f'{project_dir}/data/test.csv').astype({'label': int})

# Row order is left untouched (no shuffling) for the test set
test_ds = custom_data_load(test_data, tokenizer, shuffle=False)

#### Split training into training and validation

In [None]:
# Hold out 10% of the training rows for validation; the fixed random_state
# makes the split repeatable across runs.
# NOTE(review): the split is not stratified by label -- confirm that is acceptable.
train_data, valid_data = train_test_split(train_data_to_split, test_size=0.1, random_state=42)

In [None]:
# Wrap both splits in the tokenizing dataset class. Only the training rows are
# shuffled; the validation rows keep the order produced by the split.
train_ds = custom_data_load(dataframe = train_data, tokenizer = tokenizer, shuffle=True)
valid_ds = custom_data_load(dataframe = valid_data, tokenizer = tokenizer, shuffle=False)

#### Data collator

In [None]:
# Each example carries a single sequence-level label, so only the model inputs
# need padding: use the plain padding collator. (The token-classification
# collator pads per-token label sequences and expects an integer
# `label_pad_token_id`, so it does not fit this task.)
# NOTE: the Trainer cell currently leaves `data_collator` commented out, so this
# object only takes effect once it is passed to the Trainer.
data_collator = DataCollatorWithPadding(tokenizer)
#data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

#### Training arguments

In [6]:
# Hyper-parameters and bookkeeping options for the Trainer.
training_args = TrainingArguments(
    output_dir = model_out_dir,
    num_train_epochs= 3,
    per_device_train_batch_size=32, # powers of 2
    weight_decay=0.015, # regularization
    learning_rate=1e-5,
    gradient_accumulation_steps=2, # how many batches of gradients are accumulated before parameter update
    #gradient_checkpointing=True, # helps reduce memory
    #dataloader_num_workers=4,

    log_level="error",
    evaluation_strategy="epoch",
    #eval_steps=500,
    #logging_steps=500,
    logging_strategy="epoch",
    save_strategy="no", # don't want to save checkpoints -- takes up too much space, can change later
    # Mixed precision needs a GPU; the notebook falls back to the CPU when CUDA
    # is unavailable, so only request fp16 when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    #dataloader_pin_memory=True
    )

#### Extra metrics to compute with validation data

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 from evaluation predictions.

    Parameters
    ----------
    eval_pred : pair ``(predictions, labels)``
        ``predictions`` is either an array of logits whose last axis indexes the
        classes, or a tuple whose first element is that array (presumably the
        case when the model returns extra outputs -- verify against the model).
        ``labels`` holds the true class ids; any shape is accepted and is
        flattened, e.g. ``(N,)`` or ``(N, 1)``.

    Returns
    -------
    dict
        Keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    logits, labels = eval_pred

    # Only unwrap real tuples: indexing an array with [0] would silently select
    # the first example instead of the logits.
    if isinstance(logits, tuple):
        logits = logits[0]

    logits = np.asarray(logits)
    # argmax over the class axis; softmax is monotonic, so it is not needed here
    predicted_labels = logits.reshape(-1, logits.shape[-1]).argmax(axis=-1)
    # the dataset emits labels with a trailing axis of size 1 -> flatten to (N,)
    labels = np.asarray(labels).reshape(-1)

    accuracy = accuracy_score(labels, predicted_labels)
    precision = precision_score(labels, predicted_labels)
    recall = recall_score(labels, predicted_labels)
    f1 = f1_score(labels, predicted_labels)

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

#### Train the model

In [None]:
# Assemble the Trainer from the model, the training arguments, the train and
# validation datasets, and the metric function.
# No data_collator is passed (the line is commented out), so batching is left
# to the Trainer's default -- presumably fine because the dataset already pads
# every example to a fixed length; verify if the padding strategy changes.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    compute_metrics=compute_metrics
    #data_collator = data_collator
)

# Run fine-tuning, then write the final model to the run's output directory
# (intermediate checkpoints are disabled via save_strategy="no").
trainer.train()
trainer.save_model(training_args.output_dir)

In [None]:
# Persist the per-epoch training/evaluation log of this run as a CSV file.
# Create the output directory first so a missing folder cannot make the write
# fail after a long training run.
log_dir = f'{project_dir}/model_output'
os.makedirs(log_dir, exist_ok=True)

log_df = pd.DataFrame(trainer.state.log_history)
log_df.to_csv(f'{log_dir}/log_{model_name}.csv', index=False)

#### Evaluate the model

In [8]:
# load fine-tuned model (the one that was just trained)
# Config and weights are read back from `training_args.output_dir`, the
# directory the trained model was saved to.
# NOTE(review): this cell needs `device` from the model-loading cell near the
# top of the notebook. The recorded output shows
# "NameError: name 'device' is not defined", i.e. that cell had not been run in
# the current kernel -- run it first.
config = BertConfig.from_pretrained(f'{training_args.output_dir}/config.json')
print(config.num_labels) #2 labels
model = AutoModelForSequenceClassification.from_pretrained(training_args.output_dir, trust_remote_code=True, config=config)
model.to(device)

2


NameError: name 'device' is not defined

In [10]:
# load another fine-tuned model
# NOTE: this overwrites `model_name`, so files written afterwards that embed
# `model_name` (e.g. the results CSV of the evaluation cell) are labelled with
# this run's name. It also replaces `config` and `model`.
model_name = 'fine_tune_new_v12'
model_load = f'{project_dir}/models/{model_name}'
config = BertConfig.from_pretrained(f'{model_load}/config.json')
model = AutoModelForSequenceClassification.from_pretrained(model_load, trust_remote_code=True, config=config)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(4096, 768, padding_idx=0)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertUnpadAttention(
            (self): BertUnpadSelfAttention(
              (dropout): Dropout(p=0.0, inplace=False)
              (Wqkv): Linear(in_features=768, out_features=2304, bias=True)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (mlp): BertGatedLinearUnitMLP(
            (gated_layers): Linear(in_features=768

In [11]:
# Evaluate the currently loaded model on the test set, one example at a time,
# and store true vs. predicted labels in a CSV named after `model_name`.
model.eval()  # inference mode (e.g. disables dropout)

predictions = []
true_labels = []

# No gradients are needed for inference; disable them once for the whole loop.
with torch.no_grad():
    for sample in test_ds:
        # add the batch dimension the model expects and move to the model's device
        input_ids = sample['input_ids'].unsqueeze(0).to(device)
        attention_mask = sample['attention_mask'].unsqueeze(0).to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # argmax over the class axis of the raw logits gives the predicted class
        predictions.append(torch.argmax(outputs.logits, dim=-1).item())
        true_labels.append(sample['labels'].item())

# explicit dtype keeps the arrays integer-typed even when the test set is empty
predictions = np.array(predictions, dtype=np.int64)
true_labels = np.array(true_labels, dtype=np.int64)

# save evaluation results to CSV (create the output directory if it is missing)
results_dir = f'{project_dir}/model_output'
os.makedirs(results_dir, exist_ok=True)
results_df = pd.DataFrame({'true_labels': true_labels, 'predicted_labels': predictions})
results_df.to_csv(f'{results_dir}/results_{model_name}.csv', index=False)

### test sequence for debugging purposes

In [None]:
import pandas as pd
import torch
from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AutoModelForSequenceClassification, 
    DataCollatorForTokenClassification, 
    DataCollatorWithPadding,
    TrainingArguments, 
    Trainer, 
    DataCollatorForLanguageModeling
)
from transformers.models.bert.configuration_bert import BertConfig 
import numpy as np
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader

# Debug setup: pick the device and load only the tokenizer. The base-model
# loading lines below are commented out, so `model` must already exist in the
# kernel (from an earlier cell) before the forward pass is run.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

#config = BertConfig.from_pretrained("zhihan1996/DNABERT-2-117M")
#model = AutoModel.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True, config=config)
#model.to(device)
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True, padding=True)

# A single hard-coded DNA sequence and its label, used as a smoke-test input.
sequence = 'CTCTTTTTTGATGCGAATACCTGTTGTGGCGGAAACTGAAGTCCCTGCCCATCAGCTGTTCGACGAGTTTGCGTTGTTCTGCGGCAGGCGTTGACATAGTTGACATTCTGACGGTAGTTGGCAGCTTTCTGCTGGGAGTGTAGAGCCTTTTGTAGTATAACTTTTTGTTCTTTTCTTTCTTCTCTAATGCCTGAGGCTTCGGCGGTTATTCGGGTAATACATTAAGGAAGTTGCCATGATGTGGAAGAATACGACTAGTCAGTTAGCGATGGCCAGCTCCTTTACCTAAATCATGTGGCCTATCTTCAGATAGCATACTACCACCAACCATCAATACCTCAATGGCTTTCAACAAGTACCCTTCGTCGGGTTTTCAGCTTTTTTCCTACTTTGCTGGATGCTTTGTGAAGATCAGGAAGAAATTTTATCGGAAAAACCTCACTAGGAGAAACTATATAAAAGAAGGGTCAGACTGAAAATGACGCCGTAATAAACTCGTG'
label = 1

# Tokenize into PyTorch tensors and move every tensor to the selected device.
inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True)
inputs = {key: value.to(device) for key, value in inputs.items()}

In [None]:
# Run one forward pass on the debug sequence without tracking gradients.
# NOTE(review): uses whichever `model` is currently in the kernel; it is
# assumed to already be on `device`, like `inputs` -- confirm before running.
with torch.no_grad():
    outputs = model(**inputs)