In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset
import pandas as pd
import selene_sdk
import numpy as np

In [2]:
# Path to the CSV file with all sampled sequences. Despite the name, this is a
# file path (passed straight to pd.read_csv), not a directory.
data_dir = "/home/jiayu-huang/codebase/CRX-Active-Learning/Data/Sampling_Test/all_samples.csv"

In [3]:
# Load the full sample table; the bare `df` on the last line renders it as the cell output.
# NOTE(review): the "Unnamed: 0" column in the output looks like an index saved by an
# earlier to_csv call — confirm before relying on it.
df = pd.read_csv(data_dir)
df

Unnamed: 0,sequence,expression_log2
0,GTCACAGCTTTACAGCCTTCGGATGTTTCCAAAGTCTCCGCTCGAT...,1.596320
1,TAAGCTGAGGGTTACTTCCCATGGCCCTGAAGAATGAGAAGGCATT...,1.737424
2,GAGCCACTTGGCTTGGCGTAGGCTTTTGAGATCTCAAAGTCCACCA...,1.319990
3,CAGACAGTACTTTGAAGAACAGATGGTGTGGGCCCTTTGTTACCAT...,0.522135
4,CAGAGGGCTAGCAGGCCCAGGAGGAGGAGGAGCCAGGGCTGTTAAA...,-0.088277
...,...,...
12474,GTCGACAGATGTGAAGATCAAGCGTGGAAAACTCTTTAAAAAGAAT...,0.721225
12475,CGTAGCCTGACTTTGCCCTAAAAACCATCCTCTTCATTACCCTTAG...,0.720954
12476,ATGACATACGTTTAATTCCCCTTTAGTATAAGGTATGAATCCTGAG...,5.929735
12477,ATCCATGTCAGTAAAACCAAACAGAACCTTGACTGGATCTCTCGCA...,2.973909


In [4]:
from transformers import BertConfig

tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
config = BertConfig.from_pretrained("zhihan1996/DNABERT-2-117M")
model = AutoModel.from_config(config).cuda()

In [5]:
# Pre-allocate memory
embeddings_mean = np.zeros([len(df), 768])

# Define batch size
BATCH_SIZE = 256

for i in range(0, len(df), BATCH_SIZE):
    end_idx = min(i + BATCH_SIZE, len(df))
    batch_dnas = df.iloc[i:end_idx]['sequence'].tolist()
    
    with torch.no_grad():
        inputs = tokenizer(batch_dnas, return_tensors='pt', padding=True).to("cuda")
        hidden_states = model(**inputs).last_hidden_state
        
        # Mean pooling
        batch_embedding_mean = torch.mean(hidden_states, dim=1).cpu()
        embeddings_mean[i:end_idx] = batch_embedding_mean

In [6]:
# Sanity check: the array was allocated as (len(df), 768), so this should show one row per sequence.
embeddings_mean.shape

(12479, 768)

In [7]:
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between two NumPy vectors.

    Args:
        a: First point, as a NumPy array.
        b: Second point, with a shape compatible with ``a`` for subtraction.

    Returns:
        The norm of the element-wise difference ``a - b``.
    """
    difference = a - b
    return np.linalg.norm(difference)

In [12]:
def find_max_distance_indices(data, start_idx, num_iterations):
    """Greedily chain together points that are far from one another.

    Starting at ``start_idx``, each step picks the not-yet-selected point with
    the largest Euclidean distance from the *most recently selected* point and
    moves there. Only the last pick is considered at each step; this is not
    max-min ("farthest point") sampling against the whole selected set.

    Distances to every point are computed in one vectorised NumPy call per
    step, and already-selected points are tracked with a boolean mask, instead
    of a Python loop over all points with a linear ``in`` scan of the selected
    list.

    Args:
        data: Array-like of points, one point per entry along the first axis.
        start_idx: Index of the first point; it is always the first entry of
            the result.
        num_iterations: Maximum number of additional points to select.

    Returns:
        list[int]: Selected indices in selection order, beginning with
        ``start_idx``. Contains fewer than ``num_iterations + 1`` entries when
        the candidates run out.

    Raises:
        IndexError: If ``start_idx`` is out of range for ``data``.
    """
    points = np.asarray(data)
    # Flatten each point to a vector so a single axis=1 norm covers scalar,
    # vector and higher-rank points alike.
    if points.ndim == 1:
        points = points[:, np.newaxis]
    elif points.ndim > 2:
        points = points.reshape(points.shape[0], int(np.prod(points.shape[1:])))

    # List to store the indices of the data points
    selected_indices = [start_idx]
    current_point = points[start_idx]

    # available[i] is True while point i can still be selected.
    available = np.ones(len(points), dtype=bool)
    available[start_idx] = False

    for _ in range(num_iterations):
        if not available.any():
            # Every point has been selected already.
            break

        distances = np.linalg.norm(points - current_point, axis=1)
        # Exclude selected points, and NaN distances, which can never win a
        # ``>`` comparison, so they are never chosen.
        distances[~available | np.isnan(distances)] = -np.inf

        # argmax returns the first maximum, so ties resolve to the lowest index.
        max_distance_idx = int(np.argmax(distances))
        if distances[max_distance_idx] == -np.inf:
            # No valid candidate remains.
            break

        selected_indices.append(max_distance_idx)
        available[max_distance_idx] = False
        current_point = points[max_distance_idx]

    return selected_indices

In [13]:
# Select up to 3001 rows: the start row (index 10829) plus at most 3000 greedy picks.
# NOTE(review): the start index and iteration count are hard-coded; their rationale is
# not shown in this notebook — confirm and consider naming them as constants.
selected_indices = find_max_distance_indices(embeddings_mean, 10829, 3000)

In [15]:
# Convert the list of selected indices to a NumPy array (rebinds the same name).
selected_indices = np.array(selected_indices)

In [16]:
# Positional lookup of the selected rows, kept in selection order;
# the bare name on the last line renders the frame as the cell output.
largest_seqs = df.iloc[selected_indices]
largest_seqs

Unnamed: 0,sequence,expression_log2
10829,AGGATTTAGGGAATGATCCGACAGATTTCCACCAATTAGTATAGGA...,-1.974335
1144,TCCCTGAGCTCACTGGTTGCCTAGACAGAACCTCTCACTGCTCCCT...,1.517654
2741,AGGAACACACTCCGCACTGCCCTCGCGTCGGACGCGAGGGGGCGTG...,-0.096106
2991,TGTTCACTACGCCTTCTGCCACTGTATCCACTACGCCTTCTGCCAC...,1.346613
1248,AGCCTCACCCTAAATTTAGACCCCAGCGTATTCTGACGACAGCCTG...,2.144198
...,...,...
8033,TTAAGCTCTATAGCTTGGTAAGACTATTGATTACCTTTCTCCCTTG...,3.293132
10121,ATAGGGTATGGACCGAGGCGAGGGGGCTCTAATCTACATATAAAGG...,2.900259
2751,TCTGGGTCTGGTTCAGTGAGCCCCTGAGGTTCACGGGTTGAAATTT...,2.382437
4502,CACGCCCACTGATTTGTGACACTAGGGCCTACTTAGTTTTAGAGTT...,-2.651467


In [18]:
# Write the selected subset to disk; index=False leaves the DataFrame index out of the file.
largest_seqs.to_csv("/home/jiayu-huang/codebase/CRX-Active-Learning/Data/Sampling_Test/largest_samples_distance.csv", index=False)