# Fine-tuning script

*Anusha, Joyce, Nina*

#### Imports

In [64]:
import pandas as pd
import torch
from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AutoModelForSequenceClassification, 
    DataCollatorForTokenClassification, 
    DataCollatorWithPadding,
    TrainingArguments, 
    Trainer, 
    DataCollatorForLanguageModeling
)
from transformers.models.bert.configuration_bert import BertConfig 
import numpy as np
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import transformers
#import Dataset
import os

import csv
from Bio import SeqIO

#os.environ['WANDB_NOTEBOOK_NAME'] = 'fine-tune'

In [65]:
# Add ids and comment out others when using
#princeton_id = 'aa8417'
princeton_id = 'ns5404'
#princeton_id = 'jf...'

project_dir = f'/scratch/gpfs/{princeton_id}/QCB557_project'

model_name = 'fine_tune_v1'
model_out_dir = f'{project_dir}/models/{model_name}'

#### Load model and tokenizer from Hugging Face

In [67]:
# use gpu
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

config = BertConfig.from_pretrained("zhihan1996/DNABERT-2-117M")
print(config.num_labels) #2 labels
model = AutoModelForSequenceClassification.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True, config=config)

# don't need to move the tokenizer to gpu b/c it's light
# use data collator to pad sequences dynamically during training
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True, padding=True)
tokenizer.pad_token = "X"

cuda:0
2


print(model)

#### Freeze and unfreeze gradients

We can play around with this later as we imrove the performance of the fine-tuned model.

In [68]:
# default when using AutoModelForSequenceClassification is that all pretrained parameters/layers are frozen except classification head
# as we go through rounds of fine-tuning, we can optionally unfreeze from of the pretrained layers to improve performance

# unfreeze the last layer in the encoder block
for name, param in model.named_parameters():
    if "encoder.layer.11" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

In [69]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(4096, 768, padding_idx=0)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertUnpadAttention(
            (self): BertUnpadSelfAttention(
              (dropout): Dropout(p=0.0, inplace=False)
              (Wqkv): Linear(in_features=768, out_features=2304, bias=True)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (mlp): BertGatedLinearUnitMLP(
            (gated_layers): Linear(in_features=768

#### Load data from Hugging Face

In [70]:
class custom_data_load(torch.utils.data.Dataset):
    def __init__(self, dataframe, tokenizer, shuffle=True):
        if shuffle:
            self.dataframe = dataframe.sample(frac=1).reset_index(drop=True)  # shuffle the dataframe
        else:
            self.dataframe = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        sequence = self.dataframe.iloc[idx]['sequence']
        #print(sequence)
        #print(len(sequence))
        label = (self.dataframe.iloc[idx]['label'])
        #print(label)

        # tokenize the sequence
        # tokenizer automatically generates attention masks
        #inputs = self.tokenizer(sequence, padding='max_length', max_length=500, truncation=True, return_tensors='pt')
        inputs = self.tokenizer(sequence, padding='max_length', max_length=128, return_tensors='pt')
        #print(inputs)
        
        # move inputs to gpu
        #inputs = {key: value.to(device) for key, value in inputs.items()}

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': (torch.tensor([label], dtype=torch.long))
            #'labels': torch.tensor([int(label)]) 
        }

#### Load train and test data

In [71]:
train_data_to_split = pd.read_csv(f'{project_dir}/data/train.csv')
train_data_to_split['label'] = train_data_to_split['label'].astype(int)

In [72]:
test_data = pd.read_csv(f'{project_dir}/data/test.csv')
test_data['label'] = test_data['label'].astype(int)

test_ds = custom_data_load(dataframe = test_data, tokenizer = tokenizer, shuffle=False)

#### Split training into training and validation

In [73]:
train_data, valid_data = train_test_split(train_data_to_split, test_size=0.1, random_state=42)

In [74]:
train_ds = custom_data_load(dataframe = train_data, tokenizer = tokenizer, shuffle=True)
valid_ds = custom_data_load(dataframe = valid_data, tokenizer = tokenizer, shuffle=False)

In [75]:
train = DataLoader(train_ds, pin_memory=True)
valid = DataLoader(valid_ds, pin_memory=True)

#### Data collator

In [76]:
data_collator = DataCollatorForTokenClassification(tokenizer, label_pad_token_id='X')
#data_collator = DataCollatorWithPadding(tokenizer)
#data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

#### Training arguments

In [77]:
training_args = TrainingArguments(
    output_dir = model_out_dir,
    num_train_epochs= 10, 
    per_device_train_batch_size=8, # powers of 2
    weight_decay=0.015, # regularization
    learning_rate=1e-5,
    gradient_accumulation_steps=2, # how many batches of gradients are accumulated before parameter update
    #gradient_checkpointing=True, # helps reduce memory
    #dataloader_num_workers=4,
    
    log_level="error",
    evaluation_strategy="steps",  
    eval_steps=500,        
    logging_steps=500,
    logging_strategy="steps",
    save_strategy="no", # don't want to save checkpoints -- takes up too much space, can change later
    fp16=True,
    #dataloader_pin_memory=True
    )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


#### Extra metrics to compute with validation data

In [82]:
#need to fix this, doesn't work with current prediction + label format
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

#### Train the model

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    #compute_metrics=compute_metrics
    #data_collator = data_collator
)

trainer.train()
trainer.save_model(training_args.output_dir)

{'loss': 0.6369, 'learning_rate': 9.73161567364466e-06, 'epoch': 0.27}
{'eval_loss': 0.666924238204956, 'eval_runtime': 5.7938, 'eval_samples_per_second': 571.641, 'eval_steps_per_second': 71.455, 'epoch': 0.27}


In [None]:
log_df = pd.DataFrame(trainer.state.log_history)
log_df.to_csv(f'{project_dir}/model_output/log_{model_name}.csv', index=False)

#### Evaluate the model

In [None]:
# need to fix this part

model.eval()

predictions = []
true_labels = []

# go through test dataset, make predictions and store them
for idx, sample in enumerate(test_ds):
    with torch.no_grad():
        input_ids = sample['input_ids'].unsqueeze(0).to(device)
        attention_mask = sample['attention_mask'].unsqueeze(0).to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        predicted_probs = torch.softmax(outputs.logits, dim=-1) # might not need this part
        predicted_labels = torch.argmax(predicted_probs, dim=-1)

        predictions.append(predicted_labels)
        true_labels.append(sample['labels'].item())

# concatenate predictions and convert true_labels to numpy array
predictions = torch.cat(predictions).cpu().numpy()
true_labels = np.array(true_labels)

# save evaluation results to CSV
results_df = pd.DataFrame({'true_labels': true_labels, 'predicted_labels': predictions})
results_df.to_csv(f'{project_dir}/model_output/results_{model_name}.csv', index=False)

In [22]:
train_ds[2100]['input_ids'].shape

torch.Size([500])

In [108]:
train_ds[2000]['input_ids']

tensor([   1,    5,   47,  151, 1519,  120, 1178, 3648,   33,  357,   29,  584,
        4022,  268,   28,  139,   34,  131,  255, 2191,  830,  115,  130,   41,
          31,  381,   20,  118, 2940,  427,   84,   61, 1340,   41,  875,  454,
          47,  113,  663,   47,   45,   89,  851,  105,   26,  196,  392,  357,
         404,  246,   65, 2870, 1817,  655,   46, 3334,  457,  380, 1113,   28,
         673, 3765,  188,  820,  425,   10,   64,  950,  119,  985, 1372,   88,
         621, 1230,   30,  120, 2044,  211,  123,   28,   19,  330,  244,   62,
        3910,  210,   82,   64,  162,  118,   75,  206,  190,  353,  405,   69,
          47,  399, 1682,   22,  920,   19,    2,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0], device='cuda:0')

In [120]:
train_ds[0]

ACCCGTTATTACTACCGCTGAAACCAAACATGTGATCAATTGGTCAACAAGGTATGTCTTTTGTTACTAACGAGCGTTTCTGGAATAGTTTGAACGGGATAATCCTCGATTTAGTCTTTCAGTCCCTTTGGCAAAAAGTTGTTTACATCGACTTGGAGAGTTGGACTTTTGCTTTTACTCATTTCAAGAAACAAAATTTTTGAAAATAATCCATTTCGCGTACGTTCAAAATGATATAATTACACATTGGCAACAATTATTTAGATTATATCACGTAGCTATTGTACTATTTTCCTATGAGAGAACTGCTGTAGCCATGTGGGATGATCATGCGGGCAAAACTAAAATACTTATATCAGTCCACAATTGAAGTATGGAAAGTTTTCTCCCAACTATGCCGTTTTTATAATGGAGACTAATGTCAAAGATAGGTTTTGCAGGCTTGATGGGTTTTCAAGACACTTTTCGGTCAAGATTTTCAGTGAGATTTGAAATTCTTG
500
1
{'input_ids': tensor([[   1,    5,   13,  495, 1148,   13,  978,  131,   65,   23,  379, 1153,
          303,   70,  167,   77,   79,   93,  764, 1532,  237,   52,   72,  250,
           26, 1272,   14,  161, 2897,  157,  707,  395,   80,   16,  226,   33,
          222,   33,   91, 1198,  432,   56,   45,   96,  311, 3921,  356,  685,
           35,  134,   96,   23, 3939,  352,  328,  581,   94,  260,  211,  194,
          612,   35,  854,  114,   23,  145,  374, 1991,   42,  914,  224,

{'input_ids': tensor([   1,    5,   13,  495, 1148,   13,  978,  131,   65,   23,  379, 1153,
          303,   70,  167,   77,   79,   93,  764, 1532,  237,   52,   72,  250,
           26, 1272,   14,  161, 2897,  157,  707,  395,   80,   16,  226,   33,
          222,   33,   91, 1198,  432,   56,   45,   96,  311, 3921,  356,  685,
           35,  134,   96,   23, 3939,  352,  328,  581,   94,  260,  211,  194,
          612,   35,  854,  114,   23,  145,  374, 1991,   42,  914,  224,   72,
          291,  280, 1178,  211,  553,  162,   25,  312,   45,   87,   78,  961,
          139,   87, 1981,   50,  806,  107, 1462,   21,  253, 2660,  568, 2707,
           91,   72,   56,  284,  721,   73, 1079,   29,    7,    2,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0], device='cuda:0'),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  

### test sequence for debugging purposes

In [2]:
import pandas as pd
import torch
from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AutoModelForSequenceClassification, 
    DataCollatorForTokenClassification, 
    DataCollatorWithPadding,
    TrainingArguments, 
    Trainer, 
    DataCollatorForLanguageModeling
)
from transformers.models.bert.configuration_bert import BertConfig 
import numpy as np
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

config = BertConfig.from_pretrained("zhihan1996/DNABERT-2-117M")
model = AutoModel.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True, config=config)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True, padding=True)

sequence = 'CTCTTTTTTGATGCGAATACCTGTTGTGGCGGAAACTGAAGTCCCTGCCCATCAGCTGTTCGACGAGTTTGCGTTGTTCTGCGGCAGGCGTTGACATAGTTGACATTCTGACGGTAGTTGGCAGCTTTCTGCTGGGAGTGTAGAGCCTTTTGTAGTATAACTTTTTGTTCTTTTCTTTCTTCTCTAATGCCTGAGGCTTCGGCGGTTATTCGGGTAATACATTAAGGAAGTTGCCATGATGTGGAAGAATACGACTAGTCAGTTAGCGATGGCCAGCTCCTTTACCTAAATCATGTGGCCTATCTTCAGATAGCATACTACCACCAACCATCAATACCTCAATGGCTTTCAACAAGTACCCTTCGTCGGGTTTTCAGCTTTTTTCCTACTTTGCTGGATGCTTTGTGAAGATCAGGAAGAAATTTTATCGGAAAAACCTCACTAGGAGAAACTATATAAAAGAAGGGTCAGACTGAAAATGACGCCGTAATAAACTCGTG'
label = 1

inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True)
inputs = {key: value.to(device) for key, value in inputs.items()}

Some weights of the model checkpoint at zhihan1996/DNABERT-2-117M were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should prob

In [11]:
device

'cuda:0'

In [3]:
with torch.no_grad():
    outputs = model(**inputs)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
outputs[0]

tensor([[[-1.2831e-01, -1.1100e-01,  1.0213e-01,  ...,  2.8361e-01,
           3.9690e-01,  1.6078e-01],
         [ 1.8397e-01, -5.9070e-02, -6.9412e-02,  ...,  3.3905e-01,
           3.8382e-02, -1.0143e-01],
         [-2.5581e-01,  3.2368e-02,  1.8041e-02,  ...,  3.4424e-02,
           1.5477e-01,  2.2153e-01],
         ...,
         [-2.3947e-02, -1.0618e-01,  1.1523e-01,  ...,  2.7422e-01,
           6.7521e-03, -1.6672e-01],
         [ 2.1788e-01,  5.5807e-01, -1.8823e-02,  ..., -8.2555e-02,
           1.2954e-01,  9.5781e-02],
         [ 2.8829e-04,  3.0053e-01,  7.4851e-02,  ...,  2.3679e-01,
           3.5851e-01,  1.4315e-01]]], device='cuda:0')