In [4]:
import os
import gc
import fastparquet
import pandas as pd
import pyarrow as pa
import dask.dataframe as dd
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split

In [5]:
data_dir = "/home/pervinco/Datasets/leash-bio"
input_file = f'{data_dir}/train.parquet'
output_files = {
    'train': f'{data_dir}/train_split.parquet',
    'valid': f'{data_dir}/valid_split.parquet',
    'test': f'{data_dir}/test_split.parquet'
}

num_proteins = 3
train_size_per_protein = 98000000
valid_size_per_protein = 200000
test_size_per_protein = 360000

In [None]:
# 초기화
train_writer = None
valid_writer = None
test_writer = None

# Parquet 파일에서 데이터 row group 단위로 읽기
parquet_file = pq.ParquetFile(input_file)

for i in range(parquet_file.num_row_groups):
    # row group 단위로 데이터 읽기
    chunk = parquet_file.read_row_groups([i]).to_pandas()

    X = chunk.drop(columns=['binds'])  # 특성 데이터
    y = chunk['binds']  # 라벨 데이터

    # Train과 나머지(valid+test) 분할
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Validation과 Test 분할
    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
    
    # 데이터 합치기
    train_chunk = pd.concat([X_train, y_train], axis=1)
    valid_chunk = pd.concat([X_valid, y_valid], axis=1)
    test_chunk = pd.concat([X_test, y_test], axis=1)

    # ParquetWriter 초기화 또는 데이터 추가
    if train_writer is None:
        train_writer = pq.ParquetWriter(f'{data_dir}/train_split.parquet', pa.Table.from_pandas(train_chunk).schema)
        valid_writer = pq.ParquetWriter(f'{data_dir}/valid_split.parquet', pa.Table.from_pandas(valid_chunk).schema)
        test_writer = pq.ParquetWriter(f'{data_dir}/test_split.parquet', pa.Table.from_pandas(test_chunk).schema)
    else:
        train_writer.write_table(pa.Table.from_pandas(train_chunk))
        valid_writer.write_table(pa.Table.from_pandas(valid_chunk))
        test_writer.write_table(pa.Table.from_pandas(test_chunk))

# ParquetWriter 닫기
if train_writer:
    train_writer.close()
    valid_writer.close()
    test_writer.close()


In [3]:
def check_class_distribution(file_path, chunk_size=1000000):
    # Parquet 파일 읽기 초기화
    parquet_file = pq.ParquetFile(file_path)
    num_row_groups = parquet_file.num_row_groups

    class_counts = pd.Series(dtype=int)

    # 청크 단위로 데이터 읽기
    for i in range(num_row_groups):
        chunk = parquet_file.read_row_groups([i]).to_pandas()
        class_counts = class_counts.add(chunk['binds'].value_counts(), fill_value=0)

    # 클래스 분포 계산
    total = class_counts.sum()
    distribution = (class_counts / total) * 100

    # binds=0과 binds=1의 갯수 출력
    binds_0_count = class_counts.get(0, 0)
    binds_1_count = class_counts.get(1, 0)
    print(f"binds=0: {binds_0_count}, binds=1: {binds_1_count}")

    return distribution

train_file = f'{data_dir}/train_split.parquet'
valid_file = f'{data_dir}/valid_split.parquet'
test_file = f'{data_dir}/test_split.parquet'

train_distribution = check_class_distribution(train_file)
valid_distribution = check_class_distribution(valid_file)
test_distribution = check_class_distribution(test_file)

print("Train Set Class Distribution:")
print(train_distribution)
print("\nValidation Set Class Distribution:")
print(valid_distribution)
print("\nTest Set Class Distribution:")
print(test_distribution)


binds=0: 234087381.0, binds=1: 1270998.0
binds=0: 29261734.0, binds=1: 158203.0
binds=0: 29261916.0, binds=1: 158022.0
Train Set Class Distribution:
binds
0    99.459973
1     0.540027
dtype: float64

Validation Set Class Distribution:
binds
0    99.462259
1     0.537741
dtype: float64

Test Set Class Distribution:
binds
0    99.462874
1     0.537126
dtype: float64


In [6]:
def read_unique_molecules(file_path):
    parquet_file = pq.ParquetFile(file_path)
    unique_molecules = set()

    for i in range(parquet_file.num_row_groups):
        chunk = parquet_file.read_row_groups([i], columns=['molecule_smiles']).to_pandas()
        unique_molecules.update(chunk['molecule_smiles'])

    return unique_molecules

unique_molecules_file = f"{data_dir}/preprocessed/molecule_smiles_uniques.parquet"
unique_molecules = read_unique_molecules(unique_molecules_file)
print(f"Total unique molecules: {len(unique_molecules)}")
print(unique_molecules)

In [1]:
def check_unique_molecules(file_path, unique_molecules):
    # Parquet 파일 읽기 초기화
    parquet_file = pq.ParquetFile(file_path)
    num_row_groups = parquet_file.num_row_groups

    molecules_in_file = set()

    # 청크 단위로 데이터 읽기
    for i in range(num_row_groups):
        chunk = parquet_file.read_row_groups([i]).to_pandas()
        molecules_in_file.update(chunk['molecule_smiles'])

    # 고유 molecule 값 포함 여부 확인
    missing_molecules = unique_molecules - molecules_in_file
    return missing_molecules

: 

In [None]:
train_file = f'{data_dir}/train_split.parquet'
missing_in_train = check_unique_molecules(train_file, unique_molecules)

if not missing_in_train:
    print("All unique molecules are present in the train set.")
else:
    print(f"Missing molecules in the train set: {missing_in_train}")

del missing_in_train
gc.collect()

In [None]:
valid_file = f'{data_dir}/valid_split.parquet'
missing_in_valid = check_unique_molecules(valid_file, unique_molecules)

if not missing_in_valid:
    print("All unique molecules are present in the valid set.")
else:
    print(f"Missing molecules in the valid set: {missing_in_valid}")

del missing_in_train
gc.collect()

In [None]:
test_file = f'{data_dir}/test_split.parquet'
missing_in_test = check_unique_molecules(test_file, unique_molecules)

if not missing_in_test:
    print("All unique molecules are present in the test set.")
else:
    print(f"Missing molecules in the test set: {missing_in_test}")

del missing_in_train
gc.collect()