# Make dictionary of DNABERT-2 model outputs for the H3K4me3 sequences

In [1]:
import pandas as pd
import torch
from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AutoModelForSequenceClassification, 
    DataCollatorForTokenClassification, 
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer, 
    DataCollatorForLanguageModeling
)
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
from transformers.models.bert.configuration_bert import BertConfig 
import numpy as np
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
#from training_args_module import training_args
import transformers
import os
import csv
from Bio import SeqIO
import argparse
import json
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Cluster user id; the project root on scratch is derived from it.
princeton_id = 'aa8417'
project_dir = f'/scratch/gpfs/{princeton_id}/QCB557_project'

# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Directory of the fine-tuned sequence-classification checkpoint.
# Derived from project_dir so it follows princeton_id instead of repeating the
# user id as a literal.
model_out_dir = f'{project_dir}/models/replicate_043024/rep1/fine_tune_parallel_v1'

In [3]:
# Read the fine-tuned classifier's config from the same directory its weights are
# loaded from (model_out_dir), so config and weights cannot point at different
# checkpoints.
config_seq = BertConfig.from_pretrained(f'{model_out_dir}/config.json', output_attentions=True)
print(config_seq.num_labels) #2 labels
model_seq = AutoModelForSequenceClassification.from_pretrained(model_out_dir, trust_remote_code=True, config=config_seq)
model_seq.to(device)

2


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(4096, 768, padding_idx=0)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertUnpadAttention(
            (self): BertUnpadSelfAttention(
              (dropout): Dropout(p=0.0, inplace=False)
              (Wqkv): Linear(in_features=768, out_features=2304, bias=True)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (mlp): BertGatedLinearUnitMLP(
            (gated_layers): Linear(in_features=768

In [4]:
# Hub identifier of the DNABERT-2 checkpoint used as the base model.
base_checkpoint = "zhihan1996/DNABERT-2-117M"

# Ask for hidden states and attentions in the base model's config.
config_base = BertConfig.from_pretrained(
    base_checkpoint,
    output_hidden_states=True,
    output_attentions=True,
)
print(config_base.num_labels) #2 labels

model_base = AutoModel.from_pretrained(
    base_checkpoint,
    trust_remote_code=True,
    config=config_base,
)
model_base.to(device)

2


Some weights of the model checkpoint at zhihan1996/DNABERT-2-117M were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should prob

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(4096, 768, padding_idx=0)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertUnpadAttention(
          (self): BertUnpadSelfAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (Wqkv): Linear(in_features=768, out_features=2304, bias=True)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (mlp): BertGatedLinearUnitMLP(
          (gated_layers): Linear(in_features=768, out_features=6144, bias=False)
          (act): GELU(approximate='none')
          

In [5]:
# Tokenizer published with the DNABERT-2 checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "zhihan1996/DNABERT-2-117M",
    trust_remote_code=True,
    padding=True,
)
# NOTE(review): the pad token is overridden with the literal "X"; confirm that
# "X" is the intended padding symbol for this tokenizer's vocabulary.
tokenizer.pad_token = "X"

In [6]:
# H3K4me3 data directory, built once from project_dir instead of repeating the
# absolute path for every file.
data_dir = f'{project_dir}/data/H3K4me3'

test = pd.read_csv(f'{data_dir}/test.csv')
train = pd.read_csv(f'{data_dir}/train.csv')

# Stack train first, then test, and renumber the rows from 0.
full = pd.concat([train, test], ignore_index=True)

# Saved with pandas' default index column, so each row's number is written to the file.
full.to_csv(f'{data_dir}/all_seqs.csv')

In [7]:
def get_model_output(model_base, model_seq, tokenizer, dataframe, device, progress_every=1000):
    """Run every sequence in ``dataframe`` through both models and collect the outputs.

    Parameters
    ----------
    model_base : callable
        Called as ``model_base(input_ids=..., attention_mask=..., output_attentions=True)``.
        Element ``[1]`` of its return value is stored under ``"attention_weights"``.
    model_seq : callable
        Called with the same arguments. Its ``.hidden_states`` attribute is stored
        under ``"hidden_states"``.
    tokenizer : callable
        Called once per sequence with ``padding='max_length'``, ``max_length=128`` and
        ``return_tensors='pt'``. The result must support ``.to(device)`` and expose
        ``'input_ids'`` and ``'attention_mask'``.
    dataframe : pandas.DataFrame
        Must provide ``'sequence'`` and ``'label'`` columns.
    device : str or torch.device
        Device the tokenized inputs are moved to. The models are not moved here.
    progress_every : int, optional
        Print a progress line after every this many rows (default 1000).
        ``0`` or ``None`` disables the progress output.

    Returns
    -------
    dict
        Maps ``f'seq_{index}_{label}'`` (dataframe index and label of the row) to a
        dict with the keys ``"sequence"``, ``"input_ids"`` (nested list),
        ``"hidden_states"`` (list of NumPy arrays) and ``"attention_weights"``
        (nested list).
    """
    results_dict = {}
    # Use the real row count (previously a hard-coded dataset size) so the
    # reported percentage is correct for any dataframe.
    total_rows = len(dataframe)

    for counter, (index, row) in enumerate(dataframe.iterrows(), start=1):
        sequence = row['sequence']
        label = row['label']

        # Moving the tokenizer output once is enough; the tensors it holds do not
        # need a second, per-tensor .to(device).
        inputs = tokenizer(sequence, padding='max_length', max_length=128, return_tensors='pt').to(device)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        # Inference only, so no gradients are tracked for either forward pass.
        with torch.no_grad():
            outputs_seq = model_seq(input_ids=input_ids, attention_mask=attention_mask, output_attentions=True)
            outputs_base = model_base(input_ids=input_ids, attention_mask=attention_mask, output_attentions=True)

        # NOTE(review): hidden_states is iterated below. That gives one array per
        # layer if the model returns a tuple of tensors, or one array per
        # first-dimension slice if it returns a single tensor. Confirm which
        # applies to this checkpoint.
        hidden_states = outputs_seq.hidden_states
        # NOTE(review): element 1 of the base model's output is stored as
        # "attention_weights". In the stock Hugging Face BertModel, element 1 is the
        # pooled output, not the attentions. Confirm what this checkpoint's custom
        # code returns at that position.
        attention_weights = outputs_base[1]

        results_dict[f'seq_{index}_{label}'] = {
            "sequence": sequence,
            "input_ids": input_ids.cpu().detach().numpy().tolist(),
            "hidden_states": [tensor.cpu().detach().numpy() for tensor in hidden_states],
            "attention_weights": attention_weights.cpu().detach().numpy().tolist()
        }

        if progress_every and counter % progress_every == 0:
            percent = (counter / total_rows) * 100
            print(f"{percent}% complete!")

    return results_dict

In [8]:
# Collect the model outputs for every row of `full`; progress lines are printed
# while the loop runs.
results_dict = get_model_output(
    model_base=model_base,
    model_seq=model_seq,
    tokenizer=tokenizer,
    dataframe=full,
    device=device,
)

2.7174651485094703% complete!
5.434930297018941% complete!
8.152395445528411% complete!
10.869860594037881% complete!
13.587325742547351% complete!
16.304790891056822% complete!
19.022256039566294% complete!
21.739721188075762% complete!
24.45718633658523% complete!
27.174651485094703% complete!
29.892116633604175% complete!
32.609581782113644% complete!
35.32704693062311% complete!
38.04451207913259% complete!
40.761977227642056% complete!
43.479442376151525% complete!
46.19690752466099% complete!
48.91437267317046% complete!
51.63183782167994% complete!
54.349302970189406% complete!
57.066768118698874% complete!
59.78423326720835% complete!
62.50169841571782% complete!
65.21916356422729% complete!
67.93662871273676% complete!
70.65409386124622% complete!
73.3715590097557% complete!
76.08902415826518% complete!
78.80648930677464% complete!
81.52395445528411% complete!
84.24141960379357% complete!
86.95888475230305% complete!
89.67634990081253% complete!
92.39381504932199% complete!
95

In [10]:
def save_to_pickle(results_dict, file_path):
    """Pickle ``results_dict`` into the file at ``file_path``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(file_path, mode='wb') as sink:
        pickle.dump(results_dict, sink)

# Store the results alongside the H3K4me3 input data. The path is built from
# project_dir so it tracks the configured project root.
file_path = f'{project_dir}/data/H3K4me3/results_dict.pkl'
save_to_pickle(results_dict, file_path)

In [11]:
import pickle

def load_from_pickle(file_path):
    """Return the object unpickled from the file at ``file_path``.

    Unpickling can execute arbitrary code, so only use this on files you
    wrote yourself or otherwise trust.
    """
    with open(file_path, mode='rb') as source:
        restored = pickle.load(source)
    return restored

# Reload the saved results from disk. The path is built from project_dir so it
# tracks the configured project root.
file_path = f'{project_dir}/data/H3K4me3/results_dict.pkl'
results_dict = load_from_pickle(file_path)

# Printing the first and last key content
first_key = next(iter(results_dict))
last_key = list(results_dict.keys())[-1]

# One loop replaces the two copy-pasted print pairs; the output is unchanged.
for heading, key in (("First key content:", first_key), ("\nLast key content:", last_key)):
    print(heading)
    print(results_dict[key])

First key content:
{'sequence': 'ACTACAACTGTTTAGCTACTAACTTGCTTGACATCCAAAGAGCAGGCCAATCACTAGAGAATGTCTGGGATCACTCAGATGGGTACACACAAGGCAGTGGATGTGGTTGTTACCGCCACGACTAGGGTCACTCTCTACCGTGAATTTCGAAATTTCGCGAACTGAGTGCGCCAGCTAGCCATTGCTACCAAGCTAACAAAAGGATCAGGCTGCCCAAACGGACGTAGACTCACTGGGCTCCGTCTAGAGAATGACACGAAACCTCACTCGTTGTTTCCTTCCGAGTGATTAACTAGACGGGAATACTGCCTGGAAATTTTTTCGGAAGAGAAAGTTTCGAATTTTAGTTATTCAGTTTCCCGAAAAAGGAATAAAATATGTACAGATGTAGTTGATCAAAAAAAAGTCTATGGACATCATTCCTACTGAAGGCACAGCTAGTACCTTATTTATTGCTGATGCTTTAGCAGATTTAATCAACTGTCAACAGCAGTGATATT', 'input_ids': [[1, 5, 770, 2084, 105, 79, 1645, 100, 388, 208, 1113, 759, 2059, 54, 1152, 63, 83, 138, 2242, 112, 483, 42, 2222, 3305, 93, 681, 67, 2032, 13, 1276, 16, 286, 685, 2350, 30, 504, 105, 475, 105, 76, 233, 96, 1152, 189, 652, 174, 194, 933, 277, 78, 41, 2039, 414, 409, 190, 40, 395, 130, 13, 875, 84, 37, 846, 25, 236, 74, 45, 122, 446, 299, 31, 480, 69, 77, 115, 736, 2121, 57, 58, 981, 55, 106, 512, 195, 39, 2302, 100, 127, 114, 119, 912, 105, 1591, 