In [1]:
import os

import numpy as np
import pandas as pd
import selene_sdk
import torch
from torch.utils.data import Dataset
from transformers import AutoModel, AutoTokenizer, BertConfig

In [2]:
# Path to the CSV of sampled sequences. Despite the name, this is a file path,
# not a directory; the name is kept because later cells refer to `data_dir`.
# Set the CRX_SAMPLES_CSV environment variable to run the notebook on another
# machine without editing this cell; the original location is the default.
data_dir = os.environ.get(
    "CRX_SAMPLES_CSV",
    "/home/jiayu-huang/codebase/CRX-Active-Learning/Data/Sampling_Test/all_samples.csv",
)

In [3]:
# The first CSV column has no header, so a plain read_csv imports it as a
# column called "Unnamed: 0". It appears to be a previously saved row index
# (TODO confirm), so use it as the index instead of carrying a junk column.
df = pd.read_csv(data_dir, index_col=0)

# Fail early, with a clear message, if the column the embedding step reads
# is not present.
assert "sequence" in df.columns, "expected a 'sequence' column in the samples CSV"

df

Unnamed: 0,sequence,expression_log2
0,GTCACAGCTTTACAGCCTTCGGATGTTTCCAAAGTCTCCGCTCGAT...,1.596320
1,TAAGCTGAGGGTTACTTCCCATGGCCCTGAAGAATGAGAAGGCATT...,1.737424
2,GAGCCACTTGGCTTGGCGTAGGCTTTTGAGATCTCAAAGTCCACCA...,1.319990
3,CAGACAGTACTTTGAAGAACAGATGGTGTGGGCCCTTTGTTACCAT...,0.522135
4,CAGAGGGCTAGCAGGCCCAGGAGGAGGAGGAGCCAGGGCTGTTAAA...,-0.088277
...,...,...
12474,GTCGACAGATGTGAAGATCAAGCGTGGAAAACTCTTTAAAAAGAAT...,0.721225
12475,CGTAGCCTGACTTTGCCCTAAAAACCATCCTCTTCATTACCCTTAG...,0.720954
12476,ATGACATACGTTTAATTCCCCTTTAGTATAAGGTATGAATCCTGAG...,5.929735
12477,ATCCATGTCAGTAAAACCAAACAGAACCTTGACTGGATCTCTCGCA...,2.973909


In [4]:
from transformers import BertConfig

tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
config = BertConfig.from_pretrained("zhihan1996/DNABERT-2-117M")
model = AutoModel.from_config(config).cuda()

In [6]:
# Pre-allocate memory
embeddings_mean = np.zeros([len(df), 768])

# Define batch size
BATCH_SIZE = 256

for i in range(0, len(df), BATCH_SIZE):
    end_idx = min(i + BATCH_SIZE, len(df))
    batch_dnas = df.iloc[i:end_idx]['sequence'].tolist()
    
    with torch.no_grad():
        inputs = tokenizer(batch_dnas, return_tensors='pt', padding=True).to("cuda")
        hidden_states = model(**inputs).last_hidden_state
        
        # Mean pooling
        batch_embedding_mean = torch.mean(hidden_states, dim=1).cpu()
        embeddings_mean[i:end_idx] = batch_embedding_mean

In [7]:
# Show the pooled embedding matrix (one row per sequence in `df`); NumPy
# abbreviates the repr of large arrays, so only the corners are printed.
embeddings_mean

array([[-2.67608851e-01,  6.32093251e-01, -3.98190588e-01, ...,
        -2.57434547e-01,  2.22299676e-02,  8.68904293e-01],
       [-1.78506762e-01,  5.03653467e-01, -2.53276706e-01, ...,
        -3.66730660e-01, -1.49027929e-01,  1.28093004e+00],
       [-2.60422850e-04,  5.50247610e-01, -3.39847237e-01, ...,
        -4.64525998e-01,  8.88472497e-02,  9.18240488e-01],
       ...,
       [-3.30259472e-01,  6.56379104e-01, -1.66249633e-01, ...,
        -1.82980910e-01,  3.34969938e-01,  8.62597287e-01],
       [-4.81922328e-02,  6.11081302e-01,  4.83461618e-02, ...,
        -6.28291965e-01,  1.87628314e-01,  8.64374638e-01],
       [-2.10417598e-01,  5.62864423e-01, -2.15571702e-01, ...,
        -2.68983599e-02,  7.15724677e-02,  6.72749937e-01]])

In [8]:
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    a, b : array-like
        Vectors of the same shape. Lists and tuples are accepted as well as
        NumPy arrays; they are converted before subtracting.

    Returns
    -------
    numpy.floating
        ``||a - b||`` as computed by ``np.linalg.norm``.
    """
    # Plain Python sequences do not support element-wise subtraction, so
    # coerce both operands to arrays first (a no-op for ndarray inputs).
    a = np.asarray(a)
    b = np.asarray(b)
    return np.linalg.norm(a - b)

In [22]:
# Spot check: L2 distance between the embeddings stored in rows 6 and 5454.
# NOTE(review): the row indices look arbitrary — confirm why this pair was chosen.
euclidean_distance(embeddings_mean[6],embeddings_mean[5454])

5.8689350925930865

In [14]:
def cosine_similarity(a, b):
    """Cosine of the angle between vectors ``a`` and ``b``.

    Evaluated as ``dot(a, b) / (||a|| * ||b||)``. There is no guard for
    zero-length vectors: if either norm is zero the division is by zero.
    """
    magnitude_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / magnitude_product