In [4]:
import pandas as pd
import numpy as np
import os

# 1. File locations: three input Parquet files and the directory/paths
#    the re-split output is written to.
file1, file2, file3 = (
    r'parquet/val2025.parquet',
    r'parquet/train2025.parquet',
    r'parquet/test2025.parquet',
)

output_dir = r'parquet/parquet0218'
# Create the output directory up front; exist_ok=True makes re-running harmless.
os.makedirs(output_dir, exist_ok=True)

# Destination of each split, keyed 'file1'..'file3'.
output_files = dict(
    file1=os.path.join(output_dir, 'split1.parquet'),
    file2=os.path.join(output_dir, 'split2.parquet'),
    file3=os.path.join(output_dir, 'split3.parquet'),
)

# 2. Load the three source Parquet files, in the order file1, file2, file3.
df1, df2, df3 = (pd.read_parquet(path) for path in (file1, file2, file3))

# 3-4. Stack the frames under a fresh 0..n-1 index, then shuffle every row
#      (frac=1) with a fixed random_state and renumber the index again.
combined_df = (
    pd.concat([df1, df2, df3], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# 5. Split the rows into three parts: 80% / 10% / remainder.
total_rows = len(combined_df)

# int() truncates, so the first two sizes round down; the third part is
# defined as whatever is left, which makes the three sizes sum to
# total_rows by construction (no separate correction step is needed).
split1_size = int(0.8 * total_rows)
split2_size = int(0.1 * total_rows)
split3_size = total_rows - split1_size - split2_size

# Draw the row order from a seeded generator so that the same rows land in
# the same split on every run. An unseeded np.random.shuffle would produce a
# different split membership each time the cell is executed.
split_rng = np.random.default_rng(42)
indices = split_rng.permutation(total_rows)

# Consecutive, non-overlapping slices of the permuted positions.
split1_indices = indices[:split1_size]
split2_indices = indices[split1_size:split1_size + split2_size]
split3_indices = indices[split1_size + split2_size:]

# NOTE(review): rows are assigned independently of each other. If several
# rows share a sequence or PDB entry, they can end up in different splits —
# confirm that this is acceptable for how the splits are used.
split1_df = combined_df.iloc[split1_indices].reset_index(drop=True)
split2_df = combined_df.iloc[split2_indices].reset_index(drop=True)
split3_df = combined_df.iloc[split3_indices].reset_index(drop=True)

# Every row must be in exactly one split.
assert len(split1_df) + len(split2_df) + len(split3_df) == total_rows

# 6. Write each split to its Parquet destination, in the order file1..file3.
for split_key, split_frame in (
    ('file1', split1_df),
    ('file2', split2_df),
    ('file3', split3_df),
):
    split_frame.to_parquet(output_files[split_key])

# Report where the three files were written.
print(f"拆分完成！文件保存路径：\n{output_files['file1']}\n{output_files['file2']}\n{output_files['file3']}")

拆分完成！文件保存路径：
parquet/parquet0218/split1.parquet
parquet/parquet0218/split2.parquet
parquet/parquet0218/split3.parquet


In [5]:
import pandas as pd

# Load split1.parquet from parquet/parquet0218 and preview it.
# (An alternative input was left disabled here: r'parquet/val.parquet'.)
df = pd.read_parquet(r'parquet/parquet0218/split1.parquet')
# Bare last expression: the notebook renders the first rows as a table
# instead of the wrapped plain-text dump that print() produces.
df.head()

                                            sequence  \
0  QVQLVQSGAEVKKPGASVKVSCKASGYTFTGYVMHWVRQAPGQGLE...   
1  RITLKESGPPLVKPTQTLTLTCSFSGFSLSDFGVGVGWIRQPPGKA...   
2  QVQLVESGGGSAQPGGSLRLSCAVSGSVSELNTMGWFRQAPGKQRE...   
3  QVQLQQPGAELVKPGASVKLSCKASGYTFTSDWIHWVKQRPGHGLE...   
4  QVQLVQSGAEVKRPGSSVTVSCKASGGSFSTYALSWVRQAPGRGLE...   

                                     paratope_labels  \
0  [N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...   
1  [N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...   
2  [N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...   
3  [N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...   
4  [N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...   

  PDB/ nano_chain/ antigen_chain   PDB nano_chain antigen_chain  
0                  7amp H anti B  7amp          H        anti B  
1                  3mod H anti P  3mod          H        anti P  
2                  7zkz C anti A  7zkz          C        anti A  
3                  6nfv A anti C  6nfv          A        anti 

In [27]:
# Break the combined 'PDB/ nano_chain/ antigen_chain' string on spaces into at
# most three pieces (n=2 splits), so any further spaces stay in the last piece.
pdb_parts = df['PDB/ nano_chain/ antigen_chain'].str.split(' ', n=2, expand=True)
# Store the pieces as the 'PDB', 'nano_chain' and 'antigen_chain' columns of df.
df[['PDB', 'nano_chain', 'antigen_chain']] = pdb_parts

In [28]:
# Bare expression: the notebook renders df as this cell's output.
df

Unnamed: 0,sequence,paratope_labels,PDB/ nano_chain/ antigen_chain,PDB,nano_chain,antigen_chain
0,QLVESGGGLVQPGGSLRLSCAASGFAISASSIHWVRQAPGKCLEWV...,"[N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...",5f72 T anti C,5f72,T,anti C
1,QLVESGGGLVQPGGSLRLSCAASGFAISASSIHWVRQAPGKCLEWV...,"[N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...",5f72 S anti K,5f72,S,anti K
2,VQLKQSGPGLLQPSQRLSITCTVSGFSLGRYGVHWIRQSPGKGLEW...,"[N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...",5ldn H anti A,5ldn,H,anti A
3,EVQLQQSGPELVKPGASVKIFCKASGYTFTDYNMDWVKQSHGKRLE...,"[N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...",7y3j H anti A,7y3j,H,anti A
4,VQLVETGGGLVQAGGSLRLSCATSGFNFRLRTMGWYRQAPGKEREL...,"[N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...",6x07 B anti A,6x07,B,anti A
...,...,...,...,...,...,...
576,QVQLQENGGGCVQAGGSLRLSCAASGSIFSINRMGWYRQAPGKQRE...,"[N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...",7zmm K anti A,7zmm,K,anti A
577,QVQLQENGGGCVQAGGSLRLSCAASGSIFSINRMGWYRQAPGKQRE...,"[N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...",7zmm E anti C,7zmm,E,anti C
578,QVQLQENGGGCVQAGGSLRLSCAASGSIFSINRMGWYRQAPGKQRE...,"[N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...",7zmm F anti B,7zmm,F,anti B
579,QVQLQENGGGCVQAGGSLRLSCAASGSIFSINRMGWYRQAPGKQRE...,"[N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...",7zmm G anti D,7zmm,G,anti D


In [29]:
# Write the frame to a single Parquet file (index not stored) and report
# the destination.
# NOTE(review): this is the same path the split cell reads as `file1`, so
# running the notebook top to bottom replaces that input with the current
# contents of `df` — confirm this is intended.
output_parquet = r'parquet/val2025.parquet'
df.to_parquet(path=output_parquet, index=False)
print(f"Saved combined DataFrame to {output_parquet}")

Saved combined DataFrame to parquet/val2025.parquet
