In [1]:
# import os

# # 设置环境变量
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# # 打印环境变量以确认设置成功
# print(os.environ.get('HF_ENDPOINT'))

import subprocess
import os

# Copy the proxy settings exported by /etc/network_turbo into this process.
# The script is sourced inside a child bash, so its variables are read back
# from the child's `env` output and written into os.environ by hand.
result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
if result.returncode != 0:
    # A non-zero status means the script could not be sourced or grep matched
    # nothing. Report it instead of silently continuing without a proxy; this
    # stays best-effort and does not raise.
    print(f"warning: no proxy settings loaded (exit code {result.returncode}): {result.stderr.strip()}")
for line in output.splitlines():
    if '=' in line:
        # Split on the first '=' only, so values that contain '=' stay intact.
        var, value = line.split('=', 1)
        os.environ[var] = value

In [3]:
from datasets import load_dataset
import random
from itertools import combinations
import pandas as pd

# ==================== Step 1: load the dataset ====================
print("正在加载 filtered-SCOPe-2.08 数据集...")
dataset_splits = load_dataset("vkarthik095/filtered-SCOPe-2.08")
dataset = dataset_splits["train"]

# Report the size, then show the fields and content of the first record.
print(f"总序列数: {len(dataset)}")
first_record = dataset[0]
print("示例数据字段:", first_record.keys())
print("示例数据:", first_record)

# ==================== Step 2: group sequences by superfamily ====================
# Resulting layout: {superfamily: {(superfamily, family): [sequence, ...]}}
superfamily_to_items = {}

for record in dataset:
    sequence = record["primary"]    # the sequence is stored under 'primary', not 'sequence'
    sf_id = record["super_family"]  # e.g., "a.1.1"
    fam_id = record["family"]       # e.g., "a.1.1.1"

    # Optional length filter (40..250 inclusive), to line up with the "short" dataset
    if len(sequence) < 40 or len(sequence) > 250:
        continue

    # setdefault creates the nested dict / list the first time a key is seen
    families_in_sf = superfamily_to_items.setdefault(sf_id, {})
    families_in_sf.setdefault((sf_id, fam_id), []).append(sequence)

print(f"有效 superfamily 数量: {len(superfamily_to_items)}")

# ==================== Step 3: build positive pairs (same superfamily, different family) ====================
pos_pairs = []
print("构建正样本（同一 superfamily，不同 family）...")
for family_dict in superfamily_to_items.values():
    # combinations() yields nothing for a superfamily with fewer than two
    # families, so those superfamilies contribute no pairs.
    for fam_a, fam_b in combinations(family_dict, 2):
        # Every sequence of the first family paired with every sequence of the second
        pos_pairs.extend(
            {"sentence1": seq_a, "sentence2": seq_b, "label": 1}
            for seq_a in family_dict[fam_a]
            for seq_b in family_dict[fam_b]
        )

print(f"正样本数量: {len(pos_pairs)}")

# ==================== Step 4: build negative pairs (different superfamilies) ====================
neg_pairs = []
print("构建负样本（不同 superfamily）...")

# Seed the global generator so the sampled negatives are the same on every
# run. Later `random` calls in the same run become deterministic as well.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# NOTE(review): all_seqs is not used anywhere in this section; it is kept
# because a later cell may read it — confirm and remove if not.
all_seqs = [item["primary"] for item in dataset if 40 <= len(item["primary"]) <= 250]

superfamilies = list(superfamily_to_items.keys())
target_neg = len(pos_pairs)  # one negative per positive keeps the classes balanced

# random.sample(superfamilies, 2) needs at least two entries; fail with a
# message that names the actual problem.
if target_neg > 0 and len(superfamilies) < 2:
    raise ValueError(
        "need at least two superfamilies to sample negative pairs, "
        f"got {len(superfamilies)}"
    )

# Build each superfamily's list of family keys once instead of on every draw.
families_by_sf = {sf: list(fams) for sf, fams in superfamily_to_items.items()}

for _ in range(target_neg):
    # Two distinct superfamilies, then one random family and one random
    # sequence from each.
    sf1, sf2 = random.sample(superfamilies, 2)
    fam1 = random.choice(families_by_sf[sf1])
    fam2 = random.choice(families_by_sf[sf2])
    s1 = random.choice(superfamily_to_items[sf1][fam1])
    s2 = random.choice(superfamily_to_items[sf2][fam2])
    neg_pairs.append({
        "sentence1": s1,
        "sentence2": s2,
        "label": 0
    })

print(f"负样本数量: {len(neg_pairs)}")

# ==================== Step 5: merge, shuffle, save ====================
all_pairs = [*pos_pairs, *neg_pairs]
random.shuffle(all_pairs)

df = pd.DataFrame(all_pairs)

print(f"最终远程同源测试集规模: {len(df)} 条")
# Count rows per label with boolean masks
n_positive = int((df["label"] == 1).sum())
print(f"  正样本 (同超家族不同家族): {n_positive}")
n_negative = int((df["label"] == 0).sum())
print(f"  负样本 (不同超家族): {n_negative}")

# Save to CSV without the index column
output_csv = "protein_pair_remote.csv"
df.to_csv(output_csv, index=False)
print("测试集已保存为 protein_pair_remote.csv")

# Optional: upload to the biopaws dataset
# from huggingface_hub import login
# login(token="your_token")
# from datasets import Dataset
# hf_ds = Dataset.from_pandas(df)
# hf_ds.push_to_hub("dnagpt/biopaws", config_name="remote-homology-scop")

正在加载 filtered-SCOPe-2.08 数据集...
总序列数: 14535
示例数据字段: dict_keys(['id', 'primary', 'protein_length', 'class', 'fold', 'super_family', 'family', 'description'])
示例数据: {'id': 'd1dlwa_', 'primary': 'slfeqlggqaavqavtaqfyaniqadatvatffngidmpnqtnktaaflcaalggpnawtgrnlkevhanmgvsnaqfttvighlrsaltgagvaaalveqtvavaetvrgdvvtv', 'protein_length': 116, 'class': 'a', 'fold': 'a.1', 'super_family': 'a.1.1', 'family': 'a.1.1.1', 'description': 'd1dlwa_ a.1.1.1 (A:) Protozoan/bacterial hemoglobin {Ciliate (Paramecium caudatum) [TaxId: 5885]}'}
有效 superfamily 数量: 1754
构建正样本（同一 superfamily，不同 family）...
正样本数量: 179702
构建负样本（不同 superfamily）...
负样本数量: 179702
最终远程同源测试集规模: 359404 条
  正样本 (同超家族不同家族): 179702
  负样本 (不同超家族): 179702
测试集已保存为 protein_pair_remote.csv


In [4]:
# Optional: upload the pair table to the biopaws dataset repo on the Hugging Face Hub.
import os
from getpass import getpass

from huggingface_hub import login
from datasets import Dataset

# SECURITY: an access token used to be hard-coded on the login() line. Any
# token that has appeared in a saved or shared notebook must be treated as
# leaked and revoked in the Hugging Face account settings.
# Read the token from the HF_TOKEN environment variable, or prompt for it
# without echoing it to the notebook.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

hf_dataset = Dataset.from_pandas(df)
hf_dataset.push_to_hub("dnagpt/biopaws", config_name="protein_pair_remote")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/360 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   7%|7         | 6.29MB / 88.8MB            

CommitInfo(commit_url='https://huggingface.co/datasets/dnagpt/biopaws/commit/6404c36e42e9f7b83cadcdd70029042757eb8b57', commit_message='Upload dataset', commit_description='', oid='6404c36e42e9f7b83cadcdd70029042757eb8b57', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/dnagpt/biopaws', endpoint='https://huggingface.co', repo_type='dataset', repo_id='dnagpt/biopaws'), pr_revision=None, pr_num=None)