In [1]:
import subprocess
import os
import json

# Source /etc/network_turbo in a bash subshell (assumed to export proxy
# settings -- confirm on the target machine), list the resulting environment,
# and keep only the lines that mention "proxy".
result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout

# Copy every NAME=VALUE line into this process's environment. Only the first
# '=' separates name from value, so values may themselves contain '='.
proxy_assignments = [entry for entry in output.splitlines() if '=' in entry]
os.environ.update(dict(entry.split('=', 1) for entry in proxy_assignments))

In [2]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Tokenizer: load the pretrained tokenizer of the dnagpt/gene_eng_gpt2_v1_ft checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dnagpt/gene_eng_gpt2_v1_ft")

In [3]:
# Fetch the tokenizer's vocabulary (later cells iterate over its keys as
# token strings) and display how many entries it contains.
word_dict = tokenizer.get_vocab()
len(word_dict)

100000

In [4]:
from transformers import GPT2Tokenizer, GPT2Model,AutoModel
import torch
# Checkpoint shared by the tokenizer and the model.
model_name="dnagpt/gene_eng_gpt2_v1_ft"
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing in model.to(device).
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to(device)

def get_text_embedding(text):
    """
    Embed a piece of text as the mean of the model's last hidden states.

    The function takes no model or device arguments: it relies on the
    notebook-level ``tokenizer``, ``model`` and ``device`` objects.

    Args:
        text (str): Input text. It is tokenized and truncated to at most
            512 tokens.

    Returns:
        torch.Tensor: The last hidden state averaged over the sequence
        dimension, with the leading batch dimension of size 1 removed
        (i.e. [hidden_size] for a single input string). The tensor stays on
        ``device``.
    """
    # Tokenize; return_tensors="pt" yields tensors that already carry a batch dimension.
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Inference only: no gradients are needed for embedding extraction.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state  # [batch_size, seq_length, hidden_size]

    # Mean pooling over the sequence axis, then drop only the batch axis
    # (squeeze(0) is explicit about which axis is removed).
    embeddings = hidden_states.mean(dim=1).squeeze(0)

    return embeddings

In [5]:
def classify_sequence(sequence):
    """
    Classify a string as "DNA", "Protein", "English" or "Unknown" by its alphabet.

    Leading and trailing whitespace is stripped first. The checks are
    case-sensitive and run from the narrowest alphabet to the widest, because
    the DNA letters are a subset of the protein letters, which are in turn a
    subset of the English character set.

    Args:
        sequence (str): The string to classify.

    Returns:
        str: "DNA" if every character is one of ACGT; otherwise "Protein" if
        every character is one of the 20 upper-case amino-acid letters;
        otherwise "English" if every character is an ASCII letter, digit,
        space or one of the listed punctuation marks; otherwise "Unknown".
        An empty or whitespace-only input is "Unknown".
    """
    # Alphabets (DNA and protein letters are upper-case only).
    dna_chars = set('ACGT')
    protein_chars = set('ACDEFGHIKLMNPQRSTVWY')
    english_chars = set('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 ,.!?:;-"\'()')

    sequence = sequence.strip()

    # An empty string would satisfy every "all characters in ..." test
    # vacuously and be reported as DNA, so reject it explicitly.
    if not sequence:
        return "Unknown"

    symbols = set(sequence)

    if symbols <= dna_chars:
        return "DNA"

    if symbols <= protein_chars:
        return "Protein"

    if symbols <= english_chars:
        return "English"

    # None of the alphabets covers the input.
    return "Unknown"

# Example usage: print the label assigned to a handful of sample strings.
sequences = ["AGCT", "MVLFRSSGYV", "HELLO WORLD", "AGCZ", "XYZ", "A T G C", "HELLO, WORLD!", "ABC"]
for seq in sequences:
    label = classify_sequence(seq)
    print(f"'{seq}' is classified as: {label}")

'AGCT' is classified as: DNA
'MVLFRSSGYV' is classified as: Protein
'HELLO WORLD' is classified as: English
'AGCZ' is classified as: English
'XYZ' is classified as: English
'A T G C' is classified as: English
'HELLO, WORLD!' is classified as: English
'ABC' is classified as: English


In [6]:
# Build the DNA, English and protein word lists from the vocabulary,
# keeping only words that are at least two characters long.
dna_word_list = []
eng_word_list = []
protein_word_list = []

# Route each classifier label to the list that collects it; labels without an
# entry here ("Unknown") are simply dropped.
word_lists_by_type = {
    "DNA": dna_word_list,
    "Protein": protein_word_list,
    "English": eng_word_list,
}

for word in word_dict:
    if len(word) < 2:
        continue
    word_type = classify_sequence(word)
    target_list = word_lists_by_type.get(word_type)
    if target_list is not None:
        target_list.append(word)

print(len(dna_word_list), len(eng_word_list), len(protein_word_list))

19409 24193 40124


In [8]:
# The English dictionary is only created (empty) here; it is filled separately.
eng_word_vect_dict = {}
# Embed every DNA word: {dna_word: embedding}.
dna_word_vect_dict = {word: get_text_embedding(word) for word in dna_word_list}

In [9]:
# Embed every English word into the existing dictionary: {eng_word: embedding}.
eng_word_vect_dict.update(
    (word, get_text_embedding(word)) for word in eng_word_list
)

In [13]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

def find_most_similar_optimized(dna_word_vect_dict, eng_word_vect_dict):
    """
    Map every DNA word to the English word with the closest embedding.

    Closeness is cosine distance, searched with scikit-learn's
    ``NearestNeighbors(metric="cosine")``. Cosine is not one of scikit-learn's
    KD-tree metrics, so this is a brute-force search rather than a KD-tree
    lookup. All DNA vectors are queried in one batched ``kneighbors`` call
    instead of one call per word.

    Args:
        dna_word_vect_dict (dict): {dna_word: dna_vector}; vectors may be
            ``torch.Tensor`` (on any device) or NumPy arrays.
        eng_word_vect_dict (dict): {eng_word: eng_vector}; same vector types.

    Returns:
        dict: {dna_word: most_similar_eng_word}. Empty if
        ``dna_word_vect_dict`` is empty.
    """
    # Nothing to map: skip building the index altogether.
    if not dna_word_vect_dict:
        return {}

    def _as_numpy(vector):
        # Tensors may live on the GPU; scikit-learn needs host-side arrays.
        if isinstance(vector, torch.Tensor):
            return vector.detach().cpu().numpy()
        return vector

    # English side: word list and matrix share the same row order.
    eng_words = list(eng_word_vect_dict.keys())
    eng_vectors = np.array([_as_numpy(v) for v in eng_word_vect_dict.values()])

    nn_model = NearestNeighbors(metric="cosine").fit(eng_vectors)

    # DNA side: one matrix so that a single query covers every word.
    dna_words = list(dna_word_vect_dict.keys())
    dna_vectors = np.array([_as_numpy(v) for v in dna_word_vect_dict.values()])

    # Row i holds the index of the nearest English vector for dna_words[i].
    nearest = nn_model.kneighbors(dna_vectors, n_neighbors=1, return_distance=False)

    return {
        dna_word: eng_words[row[0]]
        for dna_word, row in zip(dna_words, nearest)
    }

# Example call: map every DNA word to its single nearest English word.
dna_eng_dict_optimized = find_most_similar_optimized(dna_word_vect_dict, eng_word_vect_dict)

In [14]:
import json

# 将 dna_eng_dict_optimized 保存到 JSON 文件中
def save_dict_to_json(data_dict, file_path):
    """
    Write a dictionary to a JSON file.

    The file is opened for writing as UTF-8, and the JSON is emitted with a
    4-space indent and with non-ASCII characters left unescaped.

    Args:
        data_dict (dict): The dictionary to serialize.
        file_path (str): Destination path of the JSON file.
    """
    with open(file_path, mode='w', encoding='utf-8') as sink:
        json.dump(data_dict, fp=sink, indent=4, ensure_ascii=False)

# Example call: persist the nearest-neighbour mapping and confirm on stdout.
save_dict_to_json(dna_eng_dict_optimized, "dna_eng_dict_optimized.json")
print("DNA-English dictionary has been saved to dna_eng_dict_optimized.json.")

DNA-English dictionary has been saved to dna_eng_dict_optimized.json.


In [15]:
# Count how many DNA words were mapped onto each English word.
en_word_dict = {}
for en_word in dna_eng_dict_optimized.values():
    en_word_dict[en_word] = en_word_dict.get(en_word, 0) + 1

en_word_dict

{'olia': 5117,
 'umbai': 2040,
 'stic': 27,
 'peninsula': 2966,
 'iciency': 22,
 'eleph': 73,
 'pson': 446,
 'ala': 589,
 'politan': 2219,
 'https': 2,
 'transported': 1883,
 'icking': 1249,
 'displaystyle': 53,
 'cemet': 10,
 'icipal': 1138,
 'coln': 54,
 'idence': 108,
 'atherine': 47,
 'olph': 108,
 'beha': 39,
 'desirable': 121,
 'atting': 26,
 'inflamm': 14,
 'surroundings': 85,
 'mamm': 221,
 'demean': 5,
 'hower': 52,
 'annah': 19,
 'ushima': 54,
 'oples': 3,
 'enty': 30,
 'directions': 1,
 'apore': 21,
 'duc': 31,
 'XXXXXXXX': 2,
 'unsupported': 1,
 'electro': 21,
 'ashed': 46,
 'T1': 4,
 'ometimes': 54,
 'ancing': 1,
 'mechanic': 5,
 'atican': 16,
 'entirety': 6,
 'archite': 2,
 'employs': 12,
 'Resour': 1,
 'enjoyable': 2,
 'ving': 3,
 'rance': 11,
 'northwest': 8,
 'ampions': 13,
 'XXXXXXXXXXXX': 5,
 'Weap': 5,
 'XT': 7,
 'amen': 21,
 'Duter': 4,
 'ampion': 1,
 'agonal': 1,
 'involve': 4,
 'underneath': 2,
 'rought': 19,
 'Carneg': 11,
 'antibi': 4,
 'inery': 13,
 'tural': 7

In [25]:
import random
from sklearn.neighbors import NearestNeighbors
import numpy as np
import torch

def find_most_similar_with_randomization(dna_word_vect_dict, eng_word_vect_dict, top_k=500, seed=None):
    """
    Map every DNA word to an English word drawn at random from its nearest neighbours.

    Neighbours are ranked by cosine distance with scikit-learn's
    ``NearestNeighbors(metric="cosine")``. Cosine is not one of scikit-learn's
    KD-tree metrics, so this is a brute-force search rather than a KD-tree
    lookup. All DNA vectors are queried in one batched ``kneighbors`` call.

    Args:
        dna_word_vect_dict (dict): {dna_word: dna_vector}; vectors may be
            ``torch.Tensor`` (on any device) or NumPy arrays.
        eng_word_vect_dict (dict): {eng_word: eng_vector}; same vector types.
        top_k (int): Size of the neighbour pool to sample from. Values larger
            than the number of English words are clamped to that number.
        seed (int | None): If given, sampling uses a private
            ``random.Random(seed)`` so the mapping is reproducible. If None
            (the default), the global ``random`` module state is used.

    Returns:
        dict: {dna_word: random_eng_word_from_top_k}. Empty if
        ``dna_word_vect_dict`` is empty.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be a positive integer")

    # Nothing to map: skip building the index altogether.
    if not dna_word_vect_dict:
        return {}

    def _as_numpy(vector):
        # Tensors may live on the GPU; scikit-learn needs host-side arrays.
        if isinstance(vector, torch.Tensor):
            return vector.detach().cpu().numpy()
        return vector

    # English side: word list and matrix share the same row order.
    eng_words = list(eng_word_vect_dict.keys())
    eng_vectors = np.array([_as_numpy(v) for v in eng_word_vect_dict.values()])

    nn_model = NearestNeighbors(metric="cosine").fit(eng_vectors)

    # DNA side: one matrix so that a single query covers every word.
    dna_words = list(dna_word_vect_dict.keys())
    dna_vectors = np.array([_as_numpy(v) for v in dna_word_vect_dict.values()])

    # kneighbors rejects n_neighbors greater than the number of fitted
    # samples, so never ask for more neighbours than there are English words.
    pool_size = min(top_k, len(eng_words))
    neighbor_rows = nn_model.kneighbors(dna_vectors, n_neighbors=pool_size, return_distance=False)

    # A dedicated generator keeps seeded runs independent of global random state.
    rng = random.Random(seed) if seed is not None else random

    dna_eng_dict = {}
    for dna_word, row in zip(dna_words, neighbor_rows):
        top_k_eng_words = [eng_words[idx] for idx in row]
        dna_eng_dict[dna_word] = rng.choice(top_k_eng_words)

    return dna_eng_dict

# Example call: pick each DNA word's English word at random from its 100 nearest neighbours.
dna_eng_dict_randomized = find_most_similar_with_randomization(dna_word_vect_dict, eng_word_vect_dict, top_k=100)

In [32]:
# Count how many DNA words were mapped onto each English word, then show how
# many distinct English words the randomized mapping actually uses.
en_word_dict = {}
for en_word in dna_eng_dict_randomized.values():
    en_word_dict[en_word] = en_word_dict.get(en_word, 0) + 1

len(en_word_dict)

618

In [33]:
def add_unique_suffix_to_dict(dna_eng_dict):
    """
    Make the English side of a DNA -> English mapping unique by numbering duplicates.

    An English word used by exactly one DNA word is kept as is. An English
    word shared by several DNA words gets an increasing numeric suffix
    ("word1", "word2", ...) in the iteration order of ``dna_eng_dict``.
    A suffixed candidate is skipped when it would equal a name that has
    already been handed out, or an English word that is kept unsuffixed
    (e.g. "T" + "1" versus a genuine "T1"), so every value in the result is
    distinct.

    Args:
        dna_eng_dict (dict): Mapping of the form {dna_word: eng_word}.

    Returns:
        dict: A new {dna_word: unique_eng_word} mapping with the same keys in
        the same order; the input is not modified.
    """
    # How many DNA words point at each English word.
    eng_word_count = {}
    for eng_word in dna_eng_dict.values():
        eng_word_count[eng_word] = eng_word_count.get(eng_word, 0) + 1

    next_suffix = {}    # next number to try for each duplicated English word
    used_names = set()  # every output name handed out so far
    updated_dict = {}
    for dna_word, eng_word in dna_eng_dict.items():
        if eng_word_count[eng_word] > 1:
            suffix = next_suffix.get(eng_word, 1)
            unique_eng_word = f"{eng_word}{suffix}"
            # Skip candidates that are already taken, or that coincide with a
            # word that appears once and will therefore be emitted unsuffixed.
            while unique_eng_word in used_names or eng_word_count.get(unique_eng_word) == 1:
                suffix += 1
                unique_eng_word = f"{eng_word}{suffix}"
            next_suffix[eng_word] = suffix + 1
        else:
            unique_eng_word = eng_word
        used_names.add(unique_eng_word)
        updated_dict[dna_word] = unique_eng_word

    return updated_dict

# Example call: number the English words that several DNA words share.
dna_eng_dict_unique = add_unique_suffix_to_dict(dna_eng_dict_randomized)

In [35]:
import json

# Save dna_eng_dict_unique to a JSON file.
output_file = "dna_eng_dict_unique.json"

# NOTE(review): unlike save_dict_to_json, this passes no explicit encoding to
# open() and leaves json.dump's ensure_ascii at its default.
with open(output_file, "w") as f:
    json.dump(dna_eng_dict_unique, f, indent=4)