In [1]:
# import os

# # 设置环境变量
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# # 打印环境变量以确认设置成功
# print(os.environ.get('HF_ENDPOINT'))

import subprocess
import os

# Import the proxy variables that /etc/network_turbo exports into this
# process's environment, so later network calls in the notebook use them.
# The command is passed as an argument list (no shell=True): it is a fixed
# bash invocation, so wrapping it in an extra /bin/sh layer is unnecessary.
result = subprocess.run(
    ["bash", "-c", "source /etc/network_turbo && env | grep proxy"],
    capture_output=True,
    text=True,
)
# grep exits non-zero when nothing matches, which is not an error here; only
# surface bash's own diagnostics (e.g. the script is missing) so a failed
# proxy setup is not completely silent.
if result.returncode != 0 and result.stderr:
    print(result.stderr.strip())
output = result.stdout
for line in output.splitlines():
    # Split on the first '=' only: the value itself may contain '='.
    var, sep, value = line.partition("=")
    if sep:
        os.environ[var] = value

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np

# ==================== Configuration ====================
MODEL_NAME = "gpt2"  # name of the tokenizer to load
DATASET_NAME = "dnagpt/biopaws"  # first argument passed to load_dataset

# Dataset configurations to analyze; each is passed to analyze_length.
SUBSETS = ["protein_pair_short", "protein_pair_full"]

# Module-level tokenizer shared by analyze_length.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def analyze_length(subset_name):
    """Print token-length statistics for one subset and return the lengths.

    Loads the "train" split of ``DATASET_NAME`` / ``subset_name``, encodes the
    ``sentence1`` and ``sentence2`` fields of every record with the
    module-level ``tokenizer`` (without special tokens), then prints the
    minimum, maximum and selected percentiles of the token counts, followed
    by suggested ``max_length`` values.

    Args:
        subset_name: Dataset configuration name passed to ``load_dataset``.

    Returns:
        A NumPy array holding every ``sentence1`` token count followed by
        every ``sentence2`` token count.
    """
    print(f"\n=== 分析子集: {subset_name} ===")
    train_split = load_dataset(DATASET_NAME, subset_name)["train"]

    def count_tokens(text):
        return len(tokenizer.encode(text, add_special_tokens=False))

    # One (sentence1, sentence2) token-count pair per record.
    pair_counts = [
        (count_tokens(row["sentence1"]), count_tokens(row["sentence2"]))
        for row in train_split
    ]
    # Flatten column by column: all sentence1 counts, then all sentence2 counts.
    all_lengths = np.array(
        [count for column in zip(*pair_counts) for count in column]
    )

    # NOTE: the figures printed below are token counts, not residue counts.
    print(f"总序列数: {len(all_lengths)}")
    print("氨基酸长度统计（近似，token数 ≈ aa数 × 1.5）:")
    print(f"  Min: {all_lengths.min()}")
    report_rows = (
        ("25%", 25),
        ("50% (中位数)", 50),
        ("75%", 75),
        ("90%", 90),
        ("95%", 95),
        ("99%", 99),
    )
    for label, q in report_rows:
        print(f"  {label}: {np.percentile(all_lengths, q):.0f}")
    print(f"  Max: {all_lengths.max()}")

    # Suggested max_length per coverage level; the +10 leaves some headroom.
    for perc in (90, 95, 99):
        suggested = int(np.percentile(all_lengths, perc)) + 10
        print(f"  保留 {perc}% 序列完整 → 建议 max_length = {suggested}")

    return all_lengths

# Run the length statistics for every configured subset
for subset in SUBSETS:
    analyze_length(subset)


=== 分析子集: protein_pair_short ===
总序列数: 40000
氨基酸长度统计（近似，token数 ≈ aa数 × 1.5）:
  Min: 21
  25%: 63
  50% (中位数): 88
  75%: 114
  90%: 132
  95%: 140
  99%: 148
  Max: 169
  保留 90% 序列完整 → 建议 max_length = 142
  保留 95% 序列完整 → 建议 max_length = 150
  保留 99% 序列完整 → 建议 max_length = 158

=== 分析子集: protein_pair_full ===


Using the latest cached version of the dataset since dnagpt/biopaws couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'protein_pair_full' at /root/.cache/huggingface/datasets/dnagpt___biopaws/protein_pair_full/0.0.0/04f23788fc267a3ab45cd27f39e8d6566a412a41 (last modified on Tue Dec 30 20:40:55 2025).
Token indices sequence length is longer than the specified maximum sequence length for this model (1283 > 1024). Running this sequence through the model will result in indexing errors


总序列数: 40000
氨基酸长度统计（近似，token数 ≈ aa数 × 1.5）:
  Min: 15
  25%: 87
  50% (中位数): 142
  75%: 219
  90%: 310
  95%: 400
  99%: 675
  Max: 4516
  保留 90% 序列完整 → 建议 max_length = 320
  保留 95% 序列完整 → 建议 max_length = 410
  保留 99% 序列完整 → 建议 max_length = 685
