In [1]:
# import subprocess
# import os
# import json

# result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
# output = result.stdout
# for line in output.splitlines():
#     if '=' in line:
#         var, value = line.split('=', 1)
#         os.environ[var] = value

In [3]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import math

In [9]:
# 12. Segment long text into paragraphs with the fine-tuned model
def split_text_sliding_window(text, tokenizer, max_length=1024, stride=256):
    """
    Split an over-long text into overlapping chunks with a sliding window.

    The text is tokenized once; consecutive windows of at most ``max_length``
    tokens are decoded back to strings. Each window starts
    ``max_length - stride`` tokens after the previous one, so neighbouring
    chunks share ``stride`` tokens.

    Args:
        text (str): Input text of arbitrary length.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            and ``decode(ids, skip_special_tokens=True)``.
        max_length (int): Maximum number of tokens per chunk. Must be > 0.
        stride (int): Number of tokens shared by neighbouring chunks
            (the overlap, not the step). Must satisfy
            ``0 <= stride < max_length``.

    Returns:
        List[str]: Decoded chunks in document order. Empty if ``text``
        tokenizes to nothing.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is out of range.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")
    # A stride >= max_length would give a step <= 0 and the window would
    # never advance (infinite loop).
    if not 0 <= stride < max_length:
        raise ValueError(
            f"stride must satisfy 0 <= stride < max_length, got stride={stride}, "
            f"max_length={max_length}"
        )

    token_ids = tokenizer.encode(text, add_special_tokens=False)
    total = len(token_ids)
    step = max_length - stride

    chunks = []
    for start in range(0, total, step):
        end = min(start + max_length, total)
        chunks.append(tokenizer.decode(token_ids[start:end], skip_special_tokens=True))
        # Once a window reaches the last token, any further window would lie
        # entirely inside this one, so stop here.
        if end == total:
            break

    return chunks

def segment_text(text, model_path="gpt2", top_k=15, max_length=1024, stride=256,
                 boundary_token="."):
    """
    Insert boundary markers into a long text using a GPT-2 language model.

    The text is tokenized once and processed in overlapping token windows.
    For every position the model's probability that the *next* token is the
    boundary token is read; positions whose probability exceeds a per-window
    dynamic threshold get the boundary token inserted after them. For every
    window after the first, the leading ``stride`` tokens are used as context
    only and are not emitted again, so the overlap does not appear twice in
    the result.

    Args:
        text (str): Input text.
        model_path (str): Name or path passed to ``from_pretrained`` for both
            the tokenizer and the model.
        top_k (int): The threshold for a window is the mean of its ``top_k``
            highest boundary probabilities. Must be >= 1.
        max_length (int): Maximum number of tokens fed to the model at once.
        stride (int): Number of tokens shared by neighbouring windows.
            Must satisfy ``0 <= stride < max_length``.
        boundary_token (str): Vocabulary token used as the boundary marker.
            Defaults to ``"."``; pass e.g. ``"<p_end>"`` for a model whose
            vocabulary contains such a token.

    Returns:
        str: The decoded text with boundary tokens inserted. Empty string if
        ``text`` tokenizes to nothing.

    Raises:
        ValueError: If ``top_k``, ``max_length`` or ``stride`` is out of
            range, or ``boundary_token`` is not in the tokenizer vocabulary.
    """
    import torch

    if top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")
    if not 0 <= stride < max_length:
        raise ValueError(
            f"stride must satisfy 0 <= stride < max_length, got stride={stride}, "
            f"max_length={max_length}"
        )

    # Load the model and tokenizer.
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.eval()  # inference mode (disables dropout)

    # Resolve the boundary token id once; it does not depend on the window.
    boundary_id = tokenizer.convert_tokens_to_ids(boundary_token)
    if boundary_id is None or (
        boundary_id == tokenizer.unk_token_id and boundary_token != tokenizer.unk_token
    ):
        # An out-of-vocabulary token silently maps to the unknown-token id,
        # which would make every probability below meaningless.
        raise ValueError(f"boundary_token {boundary_token!r} is not in the tokenizer vocabulary")

    token_ids = tokenizer.encode(text, add_special_tokens=False)
    total = len(token_ids)
    if total == 0:
        return ""

    step = max_length - stride
    segmented_ids = []
    for start in range(0, total, step):
        end = min(start + max_length, total)
        window = token_ids[start:end]

        with torch.no_grad():
            logits = model(torch.tensor([window], dtype=torch.long)).logits  # [1, seq, vocab]

        # Probability, at each position, that the next token is the boundary token.
        boundary_probs = torch.softmax(logits[0], dim=-1)[:, boundary_id]  # [seq]

        # Dynamic threshold: mean of the top_k highest probabilities in this window.
        k = min(top_k, boundary_probs.numel())
        threshold = torch.topk(boundary_probs, k).values.mean()
        is_boundary = (boundary_probs > threshold).tolist()

        # Tokens in the overlap were already emitted (and decided) by the
        # previous window; here they only provide left context.
        first_new = 0 if start == 0 else stride
        for token, flagged in zip(window[first_new:], is_boundary[first_new:]):
            segmented_ids.append(token)
            if flagged:
                segmented_ids.append(boundary_id)

        # The window reached the end of the text; later windows would add nothing.
        if end == total:
            break

    return tokenizer.decode(segmented_ids, skip_special_tokens=False)

In [10]:
# Example long input text
input_text = "Multilingual transfer ability, which reflects how well models fine-tuned on one source language can be applied to other languages, has been well studied in multilingual pre-trained models. However, the existence of such capability transfer between natural language and gene sequences/languages remains under explored.This study addresses this gap by drawing inspiration from the sentence-pair classification task used for evaluating sentence similarity in natural language. We constructed two analogous tasks: DNA-pair classification(DNA sequence similarity) and DNA-protein-pair classification(gene coding determination). These tasks were designed to validate the transferability of capabilities from natural language to gene sequences. Even a small-scale pre-trained model like GPT-2-small, which was pre-trained on English, achieved an accuracy of 78% on the DNA-pair classification task after being fine-tuned on English sentence-pair classification data(XTREME PAWS-X).  While training a BERT model on multilingual text, the precision reached 89%. On the more complex DNA-protein-pair classification task, however, the model's output was barely distinguishable from random output.Experimental validation has confirmed that the transfer of capabilities from natural language to biological language is unequivocally present. Building on this foundation, we have also investigated the impact of model parameter scale and pre-training on this capability transfer. We provide recommendations for facilitating the transfer of capabilities from natural language to genetic language,as well as new approaches for conducting biological research based on this capability.This study offers an intriguing new perspective on exploring the relationship between natural language and genetic language."
# Remove every "." from the text before it is passed to the segmentation function.
input_text_noseq = input_text.replace(".","")

# Call the segmentation function on the period-free text, then print the
# original text followed by the model output so the two can be compared.
segmented_output = segment_text(input_text_noseq)
print(input_text)
print(segmented_output)


Multilingual transfer ability, which reflects how well models fine-tuned on one source language can be applied to other languages, has been well studied in multilingual pre-trained models. However, the existence of such capability transfer between natural language and gene sequences/languages remains under explored.This study addresses this gap by drawing inspiration from the sentence-pair classification task used for evaluating sentence similarity in natural language. We constructed two analogous tasks: DNA-pair classification(DNA sequence similarity) and DNA-protein-pair classification(gene coding determination). These tasks were designed to validate the transferability of capabilities from natural language to gene sequences. Even a small-scale pre-trained model like GPT-2-small, which was pre-trained on English, achieved an accuracy of 78% on the DNA-pair classification task after being fine-tuned on English sentence-pair classification data(XTREME PAWS-X).  While training a BERT mo