In [1]:
# import subprocess
# import os
# import json

# result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
# output = result.stdout
# for line in output.splitlines():
#     if '=' in line:
#         var, value = line.split('=', 1)
#         os.environ[var] = value


import os

# Set HF_ENDPOINT to the hf-mirror.com mirror before the Hugging Face
# libraries are imported in the following cell.
os.environ.update(HF_ENDPOINT="https://hf-mirror.com")

# Echo the variable back to confirm that it was set.
print(os.environ["HF_ENDPOINT"])

https://hf-mirror.com


In [2]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling,AutoTokenizer
import math
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [4]:
# Model identifier shared by the tokenizer and the model weights.
model_path = "dnagpt/gene_eng_gpt2_v1_ft"

# Load the tokenizer, then load the model and move it onto `device`.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path).to(device)

# Switch to evaluation mode; as the last expression of the cell, the returned
# model is also displayed.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(100000, 768)
    (wpe): Embedding(256, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=100000, bias=False)
)

In [5]:
def split_text_sliding_window(text, max_length=256, stride=64):
    """
    Split a long text into overlapping chunks of at most ``max_length`` tokens.

    The text is tokenized once with the module-level ``tokenizer`` (no special
    tokens added). A window of ``max_length`` tokens is then moved forward by
    ``max_length - stride`` tokens at a time, so consecutive chunks share
    ``stride`` tokens, and every window is decoded back into a string.

    Args:
        text: Text to split.
        max_length: Maximum number of tokens per chunk; must be positive.
        stride: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= stride < max_length``.

    Returns:
        The decoded chunks in order. The list is empty when ``text`` yields
        no tokens.

    Raises:
        ValueError: If ``max_length`` is not positive, or if ``stride`` is
            negative or not smaller than ``max_length``.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")
    # stride >= max_length would give a non-positive step (the window would
    # never advance); a negative stride would leave gaps between windows.
    if not 0 <= stride < max_length:
        raise ValueError(
            f"stride must satisfy 0 <= stride < max_length, got stride={stride}, max_length={max_length}"
        )

    # Tokenize once; windows are slices of this id list.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    total = len(token_ids)
    step = max_length - stride

    chunks = []
    for start in range(0, total, step):
        end = min(start + max_length, total)
        chunks.append(tokenizer.decode(token_ids[start:end], skip_special_tokens=True))
        # Once a window reaches the end of the text, any further window would
        # lie entirely inside the overlap already emitted, so stop here.
        if end == total:
            break

    return chunks

def segment_text(text, top_k=15, max_length=256, stride=64):
    """
    Insert "." boundary markers into a long text using the fine-tuned GPT-2 model.

    The text is split into overlapping chunks with
    ``split_text_sliding_window``. For every chunk the module-level ``model``
    is run once, the probability that the model assigns to the "." token at
    each position is read from the softmax of the logits, and a per-chunk
    threshold is set to the mean of the ``top_k`` largest of those
    probabilities. A "." token is inserted after every position whose
    probability is strictly above that threshold.

    NOTE(review): chunks overlap by ``stride`` tokens and the processed chunks
    are joined unchanged, so text inside an overlap appears twice in the
    result. Confirm whether downstream code expects this.

    Args:
        text: Text to segment.
        top_k: Number of highest "." probabilities averaged to form the
            per-chunk threshold.
        max_length: Maximum number of tokens per chunk.
        stride: Number of tokens shared by consecutive chunks.

    Returns:
        The processed chunks joined with single spaces; an empty string when
        there is nothing to process.

    Raises:
        ValueError: If the tokenizer returns no id for ".".
    """
    chunks = split_text_sliding_window(text, max_length, stride)

    # The marker id does not depend on the chunk, so look it up once.
    p_end_id = tokenizer.convert_tokens_to_ids(".")
    if p_end_id is None:
        raise ValueError('tokenizer has no id for the "." boundary token')

    results = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=max_length)
        # Keep every input tensor on the same device as the model.
        inputs = {key: val.to(device) for key, val in inputs.items()}

        tokens = inputs["input_ids"][0].tolist()
        if not tokens:
            # Nothing to score for a chunk that tokenizes to nothing.
            continue

        with torch.no_grad():
            logits = model(**inputs).logits  # [batch_size, seq_length, vocab_size]
            # Probability of "." at every position of the (single) sequence.
            p_end_probs = torch.softmax(logits, dim=-1)[0, :, p_end_id]

        # Dynamic threshold: mean of the top_k largest "." probabilities.
        sorted_probs, _ = torch.sort(p_end_probs, descending=True)
        threshold = sorted_probs[:top_k].mean().item()

        # Pull all probabilities to Python floats in one transfer instead of
        # calling .item() once per token.
        prob_values = p_end_probs.tolist()

        segmented_tokens = []
        for token, prob in zip(tokens, prob_values):
            segmented_tokens.append(token)
            if prob > threshold:
                segmented_tokens.append(p_end_id)

        # Decode the token ids (with inserted markers) back into text.
        results.append(tokenizer.decode(segmented_tokens, skip_special_tokens=False))

    return " ".join(results)


In [6]:
# Example long input text (English prose).
# NOTE: this value is immediately overwritten by the assignment that follows,
# so only the DNA sequence below is actually used by the rest of the cell.
input_text = "Multilingual transfer ability, which reflects how well models fine-tuned on one source language can be applied to other languages, has been well studied in multilingual pre-trained models. However, the existence of such capability transfer between natural language and gene sequences/languages remains under explored.This study addresses this gap by drawing inspiration from the sentence-pair classification task used for evaluating sentence similarity in natural language. We constructed two analogous tasks: DNA-pair classification(DNA sequence similarity) and DNA-protein-pair classification(gene coding determination). These tasks were designed to validate the transferability of capabilities from natural language to gene sequences. Even a small-scale pre-trained model like GPT-2-small, which was pre-trained on English, achieved an accuracy of 78% on the DNA-pair classification task after being fine-tuned on English sentence-pair classification data(XTREME PAWS-X).  While training a BERT model on multilingual text, the precision reached 89%. On the more complex DNA-protein-pair classification task, however, the model's output was barely distinguishable from random output.Experimental validation has confirmed that the transfer of capabilities from natural language to biological language is unequivocally present. Building on this foundation, we have also investigated the impact of model parameter scale and pre-training on this capability transfer. We provide recommendations for facilitating the transfer of capabilities from natural language to genetic language,as well as new approaches for conducting biological research based on this capability.This study offers an intriguing new perspective on exploring the relationship between natural language and genetic language."
# Example DNA sequence; this assignment replaces the text above.
input_text = "ATTTACTACAGTGGACATCAAGGGCACATTCTTGCTGTGGCCATCAAGAGACTGTATAAATTCTATGACTTGTAGTTGTCCCACTTAAGAAACAAAGAAGCTGTGCATTTCTTTACTGGTCTAGAGCTGCTCTAGGGCATTTTCTCTACAGCAATTCTAGGTTTCCCCACCTTGTGAGTTTAGCTTTTTCTATATTCAAAGAAAAGTCCTCAGCCAGAGATTCTCAGGAGCTTATAGAACAATCCAAACTCTTGGGAATATTAAGTGGAGAGGGGTACGTGCAAGACACCAACAGCACTAGAAACAG"
# Remove every "." from the input before it is segmented.
input_text_noseq = "".join(input_text.split("."))

# Run the segmentation, then show the original text followed by the result.
segmented_output = segment_text(input_text_noseq)
print(input_text, segmented_output, sep="\n")


ATTTACTACAGTGGACATCAAGGGCACATTCTTGCTGTGGCCATCAAGAGACTGTATAAATTCTATGACTTGTAGTTGTCCCACTTAAGAAACAAAGAAGCTGTGCATTTCTTTACTGGTCTAGAGCTGCTCTAGGGCATTTTCTCTACAGCAATTCTAGGTTTCCCCACCTTGTGAGTTTAGCTTTTTCTATATTCAAAGAAAAGTCCTCAGCCAGAGATTCTCAGGAGCTTATAGAACAATCCAAACTCTTGGGAATATTAAGTGGAGAGGGGTACGTGCAAGACACCAACAGCACTAGAAACAG
ATTTAC TACAGTGG ACATC AAGGGC ACATTC TTGC TGTGGCC ATCAAG AGACTG TATAAATTC TATG ACTTG TAGTTG TCCC ACTT AAGAAACAA AGAAGC TGTGC ATTTCTT TACTGG TCTAG AGCTGC TCTAGGGC ATTTTC TCTAC AGCAA TTCTAGG TTTCCCC ACCTTG TGAG TTTAGC TTTT TCTATA TTCAAAG AAAAGTCC TCAGCC AGAGATTC TCAGGAGC TTATAG AACAA TCCAAAC TCTT GGGAA TATT AAGTGG AGAGGGG TACG TGCAAG . ACACC AACAGC ACTAGAA . ACAG


In [7]:
import json

# Read the level-1 data.
# The context manager closes the file handle as soon as the content is read,
# and the encoding is pinned to UTF-8 to match the cell that writes the
# companion JSON file, instead of depending on the platform default.
with open("p1_cluster_data_with_title.json", "r", encoding="utf-8") as f:
    json_str = f.read()
p1_cluster_data_with_title = json.loads(json_str)

In [None]:
# Attach to every item the segmented version of each of its paragraphs.
p1_cluster_data_with_title_sentence = []
for item in p1_cluster_data_with_title:
    # Items are updated in place, so the output list holds the very same dict
    # objects as the input list.
    item["para_sentence_list"] = [segment_text(para) for para in item["para_list"]]
    p1_cluster_data_with_title_sentence.append(item)

In [None]:
import json

# Write the augmented items to a JSON file (UTF-8, non-ASCII characters kept
# unescaped, 4-space indentation).
output_path = "p1_cluster_data_with_title_sentence.json"
with open(output_path, mode="w", encoding="utf-8") as out_file:
    json.dump(p1_cluster_data_with_title_sentence, out_file, indent=4, ensure_ascii=False)

print("数据已保存到 p1_cluster_data_with_title_sentence.json")