# Fine-tuning script

*Anusha, Joyce, Nina*

#### Imports

In [85]:
import pandas as pd
import torch
from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AutoModelForSequenceClassification, 
    DataCollatorForTokenClassification, 
    DataCollatorWithPadding,
    TrainingArguments, 
    Trainer, 
    DataCollatorForLanguageModeling
)
from transformers.models.bert.configuration_bert import BertConfig 
import numpy as np
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import transformers
#import Dataset

import csv
from Bio import SeqIO

In [3]:
# Cluster account whose scratch space holds the project.
# Each collaborator enables their own id and comments out the others.
princeton_id = 'aa8417'
#princeton_id = 'ns...'
#princeton_id = 'jf...'

# Name of this fine-tuning run; it names the model output sub-directory.
model_name = 'fine_tune_v1'

# Project root on scratch, and the directory the fine-tuned model is written to.
project_dir = f'/scratch/gpfs/{princeton_id}/QCB557_project'
model_out_dir = f'{project_dir}/models/{model_name}'

#### Load model and tokenizer from Hugging Face

In [4]:
# Run on the first CUDA device when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# Pretrained DNABERT-2 configuration, handed explicitly to the model loader.
config = BertConfig.from_pretrained("zhihan1996/DNABERT-2-117M")
config.num_labels  # bare read (no assignment); original note: 2 labels
model = AutoModelForSequenceClassification.from_pretrained(
    "zhihan1996/DNABERT-2-117M",
    config=config,
    trust_remote_code=True,
)

# The tokenizer is kept off the GPU (original note: it is lightweight).
tokenizer = AutoTokenizer.from_pretrained(
    "zhihan1996/DNABERT-2-117M",
    padding=True,
    trust_remote_code=True,
)
# NOTE(review): "X" overrides the checkpoint's own padding token -- confirm
# that "X" exists in the tokenizer vocabulary and is the intended pad symbol.
tokenizer.pad_token = "X"

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['classifier.weight', 'bert.pooler.dense.bias', 'classifier.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Show the module tree of the loaded model.
print(model)

#### Freeze and unfreeze gradients

We can play around with this later as we improve the performance of the fine-tuned model.

In [5]:
# AutoModelForSequenceClassification loads every parameter as trainable; nothing
# is frozen by default. The pooler and the classification head are newly
# initialized for this checkpoint (see the load warning), so they must stay
# trainable or the model can never learn the task.
# Freeze the rest of the pretrained backbone except the last encoder layer; as
# we go through rounds of fine-tuning, more layers can be added to this tuple.
trainable_name_parts = (
    "encoder.layer.11.",  # last layer in the encoder block
    "pooler.",            # newly initialized pooling layer
    "classifier.",        # newly initialized classification head
)

for name, param in model.named_parameters():
    # Train a parameter only when its name contains one of the listed parts.
    param.requires_grad = any(part in name for part in trainable_name_parts)

In [6]:
# Move the model parameters to the selected device (GPU when available, else CPU).
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(4096, 768, padding_idx=0)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertUnpadAttention(
            (self): BertUnpadSelfAttention(
              (dropout): Dropout(p=0.0, inplace=False)
              (Wqkv): Linear(in_features=768, out_features=2304, bias=True)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (mlp): BertGatedLinearUnitMLP(
            (gated_layers): Linear(in_feat

#### Dataset class (tokenizes each sequence on access)

In [86]:
class custom_data_load(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one sequence per item.

    Args:
        dataframe: pandas DataFrame with a 'sequence' column (text handed to
            the tokenizer) and a 'label' column (integer class id).
        tokenizer: callable invoked as ``tokenizer(sequence,
            padding='max_length', max_length=..., truncation=True,
            return_tensors='pt')``; it must return a mapping holding
            'input_ids' and 'attention_mask' tensors with a leading batch
            axis of size 1.
        shuffle: when True, the rows are shuffled once at construction.
        max_length: number of tokens every item is padded or truncated to.
        seed: optional random state for the shuffle, for a reproducible order.

    Items are returned as CPU tensors. Moving batches to the GPU is left to
    the training loop: CUDA tensors created here cannot be pinned by a
    DataLoader and cannot be produced in worker processes.
    """

    def __init__(self, dataframe, tokenizer, shuffle=True, max_length=500, seed=None):
        if shuffle:
            # shuffle the dataframe and renumber the rows
            self.dataframe = dataframe.sample(frac=1, random_state=seed).reset_index(drop=True)
        else:
            self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its model inputs and label.

        Returns:
            dict with 'input_ids' and 'attention_mask' (batch axis removed)
            and 'labels', a long tensor of shape (1,).
        """
        row = self.dataframe.iloc[idx]

        # Pad short sequences and truncate long ones so that every item has
        # the same length and a batch can be stacked.
        # The tokenizer generates the attention mask itself.
        inputs = self.tokenizer(
            row['sequence'],
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt',
        )

        return {
            # squeeze(0) drops only the batch axis added by return_tensors='pt'
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': torch.tensor([int(row['label'])], dtype=torch.long),
        }

#### Load train and test data

In [87]:
# Read the labelled training table and cast its 'label' column to int.
# This table is divided into training and validation parts afterwards.
train_data_to_split = pd.read_csv(f'{project_dir}/data/train.csv').astype({'label': int})

In [88]:
# Read the held-out test table and cast its 'label' column to int.
test_data = pd.read_csv(f'{project_dir}/data/test.csv').astype({'label': int})

# Wrap it without shuffling so that items keep the row order of the CSV.
test_ds = custom_data_load(test_data, tokenizer, shuffle=False)

#### Split training into training and validation

In [89]:
# Hold out 10% of the labelled rows for validation; random_state is fixed so
# the same split is produced on every run.
train_data, valid_data = train_test_split(train_data_to_split, test_size=0.1, random_state=42)

In [90]:
# Wrap both splits in the tokenizing dataset class: the training rows are
# shuffled once at construction, the validation rows keep their order.
train_ds = custom_data_load(dataframe = train_data, tokenizer = tokenizer, shuffle=True)
valid_ds = custom_data_load(dataframe = valid_data, tokenizer = tokenizer, shuffle=False)

In [67]:
# Standalone loaders over the two datasets, using the DataLoader default batch size.
# NOTE(review): the Trainer further down receives train_ds / valid_ds directly
# and these loaders are not passed to it -- confirm whether they are still needed.
train = DataLoader(train_ds, pin_memory=True)
valid = DataLoader(valid_ds, pin_memory=True)

#### Data collator

In [78]:
# Sequence classification has a single label per sequence, so there are no
# per-token labels to pad: a token-classification collator (and a string as
# label_pad_token_id) does not fit this task. DataCollatorWithPadding pads the
# tokenized inputs of a batch to a common length using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

#### Training arguments

In [79]:
# Hyper-parameters and logging/evaluation schedule for the Trainer.
training_args = TrainingArguments(
    output_dir=model_out_dir,
    num_train_epochs=10,
    per_device_train_batch_size=8,  # powers of 2
    weight_decay=0.015,  # regularization
    learning_rate=1e-5,
    gradient_accumulation_steps=2,  # how many batches of gradients are accumulated before parameter update
    #gradient_checkpointing=True, # helps reduce memory
    #dataloader_num_workers=4,

    log_level="error",
    # evaluate on the validation set and log every 500 optimizer steps
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    logging_strategy="steps",
    save_strategy="no",  # don't want to save checkpoints -- takes up too much space, can change later
    # fp16 mixed precision needs a GPU; fall back to full precision when the
    # script runs on CPU (the device selection above allows that fallback)
    fp16=torch.cuda.is_available(),
    dataloader_pin_memory=True,
)

#### Extra metrics to compute with validation data

In [80]:
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for binary predictions.

    The metrics are computed with NumPy so that the function does not depend
    on names that are not imported in this file.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` is an array of shape
            (n_samples, n_classes), or a tuple whose first element is that
            array. ``labels`` holds the integer class ids; it is flattened, so
            shapes (n_samples,) and (n_samples, 1) are both accepted.

    Returns:
        dict with float values under 'accuracy', 'precision', 'recall' and
        'f1'. Class 1 is treated as the positive class. A metric whose
        denominator is zero is reported as 0.0.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # some models return extra outputs next to the logits; keep the logits
        logits = logits[0]

    predictions = np.argmax(np.asarray(logits), axis=-1).ravel()
    labels = np.asarray(labels).ravel()

    predicted_pos = predictions == 1
    actual_pos = labels == 1
    true_pos = int(np.sum(predicted_pos & actual_pos))
    false_pos = int(np.sum(predicted_pos & ~actual_pos))
    false_neg = int(np.sum(~predicted_pos & actual_pos))

    accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0
    precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else 0.0
    recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

#### Train the model

In [91]:
# TODO: this cell still fails. The run recorded below it ends with
# "ArrowInvalid: Column 2 named labels expected length 500 but got length 1".

# Build the Trainer from the model, the training arguments, the training and
# validation datasets, and the validation metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    compute_metrics=compute_metrics
    #data_collator = data_collator
)

# Run fine-tuning, then write the final model to the configured output directory.
trainer.train()
trainer.save_model(training_args.output_dir)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


ArrowInvalid: Column 2 named labels expected length 500 but got length 1

In [None]:
# Turn the trainer's recorded log history into a table, one row per log entry,
# and store it next to the other outputs of this run.
log_history = trainer.state.log_history
log_df = pd.DataFrame(log_history)
log_df.to_csv(
    f'{project_dir}/model_output/log_{model_name}.csv',
    index=False,
)

#### Evaluate the model

In [None]:
# Predict a label for every item of the test set and save the true and
# predicted labels side by side.

model.eval()

predictions = []
true_labels = []

# gradients are not needed for inference; disable them once for the whole loop
with torch.no_grad():
    # go through test dataset, make predictions and store them
    for sample in test_ds:
        # add a batch axis of size 1 and make sure the inputs sit on the model's device
        input_ids = sample['input_ids'].unsqueeze(0).to(device)
        attention_mask = sample['attention_mask'].unsqueeze(0).to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # softmax does not change which class scores highest, so the
        # predicted class is taken straight from the logits
        predicted_label = torch.argmax(outputs.logits, dim=-1)

        predictions.append(int(predicted_label.item()))
        true_labels.append(int(sample['labels'].item()))

# plain integer lists also cover an empty test set, where concatenating an
# empty list of tensors would raise
predictions = np.array(predictions, dtype=np.int64)
true_labels = np.array(true_labels, dtype=np.int64)

# save evaluation results to CSV
results_df = pd.DataFrame({'true_labels': true_labels, 'predicted_labels': predictions})
results_df.to_csv(f'{project_dir}/model_output/results_{model_name}.csv', index=False)

In [22]:
# Spot check: shape of the token ids of one training item.
train_ds[2100]['input_ids'].shape

torch.Size([500])

In [23]:
# Spot check: token ids of one training item.
train_ds[2000]['input_ids']

tensor([   1,  349, 1054,  197,   26,  987,  914,   66,  380,   43,   50, 1480,
         153, 1396,  355,   61, 1711,  245, 1711,  245,  256,   21, 3035, 1278,
         998,   17,  313,  276,  245,   43, 1109, 2473,  737,  108,  166,   47,
         541,   75,   56,  800,   21,  390,   60,   32,  132, 1120,   63,  158,
          80,  286,  852, 2142, 2789,  243,  303,  194,   17,  397,   97,   45,
        3213,  502,   53,   58,  358, 3444,  160,  602,   35,  299,   59, 1925,
          29,  552,  179, 1183,   76,  276,  153,  927,   78,  196,  145,   28,
         710,   56,   67,   55,  970,  142,  969, 1340, 2546, 1538,  116, 3884,
        2006,   47,    8,    2,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   