In [None]:
import os
import time
import warnings
warnings.filterwarnings("ignore")

import duckdb
import requests
import PyBioMed
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import matplotlib.pyplot as plt

from multiprocessing import Pool, cpu_count
from sklearn.preprocessing import OneHotEncoder

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs import BitVectToText
from rdkit.Chem import Descriptors, rdFingerprintGenerator

from PyBioMed.Pyprotein import PyProtein
from PyBioMed.PyGetMol import GetProtein

In [None]:
# Root folder of the leash-bio dataset. Set the LEASH_BIO_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell;
# when it is unset the original location is used.
data_dir = os.environ.get("LEASH_BIO_DATA_DIR", "/home/pervinco/Datasets/leash-bio")
save_dir = f"{data_dir}/split_sets"

# Raw CSV files.
train_csv = f"{data_dir}/train.csv"
test_csv = f"{data_dir}/test.csv"

# Parquet versions of the same files (queried through DuckDB below).
train_parquet = f"{data_dir}/train.parquet"
test_parquet = f'{data_dir}/test.parquet'

# Names of the target proteins.
target_proteins = ['sEH', 'HSA', 'BRD4']

In [None]:
# Save the first 100 rows of each raw CSV next to the notebook as small
# sample files for quick inspection.
for source_csv, sample_csv in (
    (train_csv, './train_samples.csv'),
    (test_csv, './test_samples.csv'),
):
    df = pd.read_csv(source_csv, nrows=100)
    df.to_csv(sample_csv, index=False)

# 1.데이터셋 분석

In [None]:
# Count the rows of each class (binds = 0 / binds = 1) directly in the
# training parquet file.
con = duckdb.connect()

binds_counts = {}
for binds_value in (0, 1):
    row = con.query(f"""SELECT COUNT(*) 
                              FROM parquet_scan('{train_parquet}') 
                              WHERE binds = {binds_value}""").fetchone()
    binds_counts[binds_value] = row[0]
    print(f"Total binds={binds_value} : {binds_counts[binds_value]}")

count_binds_0 = binds_counts[0]
count_binds_1 = binds_counts[1]

## Total number of rows (sum of both classes)
total_count = count_binds_0 + count_binds_1
print(f"Total data : {total_count}")

con.close()

In [None]:
## Find the distinct values of each key column and how often they occur.

columns = [
    'buildingblock1_smiles', 
    'buildingblock2_smiles', 
    'buildingblock3_smiles', 
    'molecule_smiles', 
    'protein_name'
]

# Number of distinct values per column. Only the count is kept: the grouped
# frame for a high-cardinality column can be very large, so it is released as
# soon as the next column is processed.
unique_value_counts = {}

con = duckdb.connect()
try:
    for column in columns:
        query = f"SELECT {column}, COUNT(*) as count FROM parquet_scan('{train_parquet}') GROUP BY {column}"
        df = con.query(query).df()
        unique_value_counts[column] = len(df)
        # Report the result; without this every query result was overwritten
        # by the next iteration and the cell produced no output at all.
        print(f"{column}: {len(df)} unique values")
finally:
    # Release the connection even when one of the queries fails.
    con.close()

In [None]:
## Check whether any SMILES string appears in more than one building-block column.

con = duckdb.connect()

# Distinct values of every building-block column, keyed by column name.
distinct_values = {}
for bb_column in ('buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles'):
    distinct_frame = con.query(f"SELECT DISTINCT {bb_column} FROM parquet_scan('{train_parquet}')").df()
    distinct_values[bb_column] = set(distinct_frame[bb_column])

bb1_set = distinct_values['buildingblock1_smiles']
bb2_set = distinct_values['buildingblock2_smiles']
bb3_set = distinct_values['buildingblock3_smiles']

# Pairwise overlaps between the three columns.
bb1_bb2_intersection = bb1_set & bb2_set
bb1_bb3_intersection = bb1_set & bb3_set
bb2_bb3_intersection = bb2_set & bb3_set

# First a yes/no summary per pair ...
print(f"Building block 1 & 2 중복 : {'있음' if bb1_bb2_intersection else '없음'}")
print(f"Building block 1 & 3 중복 : {'있음' if bb1_bb3_intersection else '없음'}")
print(f"Building block 2 & 3 중복 : {'있음' if bb2_bb3_intersection else '없음'}")

# ... then the shared values themselves.
print(f"Building block 1과 2 사이의 중복된 값: {bb1_bb2_intersection}")
print(f"Building block 1과 3 사이의 중복된 값: {bb1_bb3_intersection}")
print(f"Building block 2와 3 사이의 중복된 값: {bb2_bb3_intersection}")

con.close()

In [None]:
# Pull up to `limit` rows of each class (binds = 0 and binds = 1) into one
# frame for a quick look at the data.
limit = 2000

balanced_sample_query = f"""(SELECT *
                      FROM parquet_scan('{train_parquet}')
                      WHERE binds = 0
                      LIMIT {limit})
                      UNION ALL
                      (SELECT *
                      FROM parquet_scan('{train_parquet}')
                      WHERE binds = 1
                      LIMIT {limit})"""

con = duckdb.connect()
data = con.query(balanced_sample_query).df()
con.close()

data.head()

# 2.RDKit을 활용한 분석

In [None]:
## Convert a molecule into a Morgan fingerprint.
_MORGAN_GENERATOR = None  # built on first use, then reused for every molecule


def compute_fingerprint(mol):
    """Return the radius-2, 2048-bit Morgan fingerprint of `mol` as text.

    Args:
        mol: RDKit molecule, or None when the SMILES could not be parsed.

    Returns:
        The fingerprint serialised with `BitVectToText`, or None when `mol`
        is None.
    """
    global _MORGAN_GENERATOR
    if mol is None:
        return None
    if _MORGAN_GENERATOR is None:
        # rdFingerprintGenerator replaces the deprecated
        # AllChem.GetMorganFingerprintAsBitVect; `fpSize` is the new name of
        # the old `nBits` argument.
        _MORGAN_GENERATOR = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
    fp = _MORGAN_GENERATOR.GetFingerprint(mol)
    return BitVectToText(fp)  # Convert to BitString for storage

In [None]:
## Compute the RDKit descriptors of a molecule given as SMILES.
def calculate_descriptors(smiles):
    """Return the `Descriptors.CalcMolDescriptors` result for `smiles`.

    An empty dict is returned when RDKit cannot parse the SMILES string.
    """
    parsed = Chem.MolFromSmiles(smiles)
    return {} if parsed is None else Descriptors.CalcMolDescriptors(parsed)

In [None]:
def process_row(smiles):
    """Featurise one SMILES string.

    Args:
        smiles: SMILES representation of the molecule.

    Returns:
        A dict with the keys 'fingerprint' (value of `compute_fingerprint`)
        and 'descriptors' (value of `Descriptors.CalcMolDescriptors`). For a
        SMILES string RDKit cannot parse the values are None and {}.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {'fingerprint': None, 'descriptors': {}}
    # Reuse the molecule parsed above; going through
    # calculate_descriptors(smiles) would parse the same SMILES a second time.
    return {
        'fingerprint': compute_fingerprint(mol),
        'descriptors': Descriptors.CalcMolDescriptors(mol),
    }

In [None]:
# molecule_smiles is the drug molecule assembled from the building blocks.
# It has to be encoded (fingerprint + descriptors) into a numeric form before
# it can be used for model training.

OFFSET = 0
CHUNK_SIZE = 1000
train_parquet = f'{data_dir}/train.parquet'

output_dir = f"{data_dir}/preprocessed"
output_file = f"{output_dir}/train.parquet"
os.makedirs(output_dir, exist_ok=True)

num_workers = cpu_count()

writer = None           # opened once the first chunk has fixed the schema
used_descriptor = None  # descriptor columns kept, decided on the first chunk
first_chunk = True

con = duckdb.connect()
try:
    # The context manager shuts the worker pool down even when a chunk fails.
    with Pool(num_workers) as pool:
        # Trial run: only the first 3000 rows are processed.
        while OFFSET < 3000:
            chunk = con.execute(f"""
            SELECT *
            FROM parquet_scan('{train_parquet}')
            LIMIT {CHUNK_SIZE} OFFSET {OFFSET}
            """).fetch_df()

            if chunk.empty:
                break

            smiles_list = chunk['molecule_smiles'].tolist()

            ## Featurise the molecules in parallel
            results = pool.map(process_row, smiles_list)

            ## Turn the results into columns
            fingerprints = [result['fingerprint'] for result in results]
            descriptors_list = [result['descriptors'] for result in results]

            chunk['fingerprints'] = fingerprints
            descriptor_df = pd.DataFrame(descriptors_list)

            if first_chunk:
                # Descriptors containing NaN in the first chunk are excluded.
                excluded_descriptors = descriptor_df.columns[descriptor_df.isna().any()].tolist()
                used_descriptor = [col for col in descriptor_df.columns if col not in excluded_descriptors]
                print(f"제외된 descriptors: {excluded_descriptors}")
                print(f"사용된 descriptors: {used_descriptor}")

            # Every chunk keeps exactly the columns chosen on the first chunk.
            # Deciding the exclusions per chunk could give a later chunk a
            # different column set than the schema the writer was opened with.
            descriptor_df = descriptor_df.reindex(columns=used_descriptor)

            chunk = pd.concat([chunk, descriptor_df], axis=1)
            table = pa.Table.from_pandas(chunk)

            if first_chunk:
                writer = pq.ParquetWriter(output_file, table.schema)
                first_chunk = False

            writer.write_table(table)
            print(f"Processed offset: {OFFSET} saved to {output_file}")
            OFFSET += CHUNK_SIZE
finally:
    # `writer` stays None when no chunk was read, so guard the close.
    if writer is not None:
        writer.close()
    con.close()

In [None]:
# Read the parquet file at `output_file` back and preview its first rows.
df = pd.read_parquet(output_file, engine='pyarrow')
df.head()

In [None]:
# Show the column names and the (rows, columns) shape of the loaded frame.
print(df.columns)
print(df.shape)

# 3.Target Protein Descriptor

In [None]:
# Maps each target protein name to the UniProt ID used to download its sequence.
uniprot_dicts = {"sEH": "P34913", "BRD4": "O60885", "HSA": "P02768"}
# Folder that receives the protein descriptor CSV; created if it is missing.
output_dir = f"{data_dir}/protein_desc"
os.makedirs(output_dir, exist_ok=True)

def get_protein_sequence(uniprot_id):
    """Download the sequence of a UniProt entry from its FASTA record.

    Args:
        uniprot_id: UniProt identifier inserted into the download URL.

    Returns:
        The sequence as one string (all FASTA lines after the header joined
        together), or None when the request fails or does not answer with
        HTTP 200.
    """
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
    try:
        # Without a timeout a stalled connection would block the notebook
        # indefinitely.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Network problems take the same "no sequence" path as a bad status,
        # so the caller's existing failure handling applies.
        print(f"Request for {uniprot_id} failed: {exc}")
        return None

    if response.status_code != 200:
        return None

    lines = response.text.splitlines()
    # The first line of a FASTA record is the header; the rest is the sequence.
    seq = "".join(lines[1:])
    return seq

# Download the sequence of every target protein; proteins whose download
# fails are reported and left out.
protein_seq_dicts = {}
for protein_name, uniprot_id in uniprot_dicts.items():
    fetched_sequence = get_protein_sequence(uniprot_id)
    if not fetched_sequence:
        print(f"Failed to retrieve sequence for {protein_name} ({uniprot_id})")
        continue
    protein_seq_dicts[protein_name] = fetched_sequence

# One record per protein: its name followed by the values of PyProtein.GetCTD().
ctd_features = [
    {'protein_name': name, **PyProtein(seq).GetCTD()}
    for name, seq in protein_seq_dicts.items()
]

ctd_df = pd.DataFrame(ctd_features)
# Ensure 'protein_name' is the first column
feature_columns = [col for col in ctd_df.columns if col != 'protein_name']
ctd_df = ctd_df[['protein_name'] + feature_columns]
ctd_df.to_csv(f"{output_dir}/protein_descriptors.csv", index=False)
print(ctd_df.head())

# 4.전체 데이터에 대한 FingerPrint, Descriptor 계산.

In [None]:
## Convert a molecule into a Morgan fingerprint.
_MORGAN_GENERATOR = None  # built on first use, then reused for every molecule


def compute_fingerprint(mol):
    """Return the radius-2, 2048-bit Morgan fingerprint of `mol` as text.

    Args:
        mol: RDKit molecule, or None when the SMILES could not be parsed.

    Returns:
        The fingerprint serialised with `BitVectToText`, or None when `mol`
        is None.
    """
    global _MORGAN_GENERATOR
    if mol is None:
        return None
    if _MORGAN_GENERATOR is None:
        # rdFingerprintGenerator replaces the deprecated
        # AllChem.GetMorganFingerprintAsBitVect; `fpSize` is the new name of
        # the old `nBits` argument.
        _MORGAN_GENERATOR = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
    fp = _MORGAN_GENERATOR.GetFingerprint(mol)
    return BitVectToText(fp)  # Convert to BitString for storage


## Compute the RDKit descriptors of a molecule given as SMILES.
def calculate_descriptors(smiles):
    """Parse `smiles` and return its `Descriptors.CalcMolDescriptors` result.

    Returns an empty dict when the SMILES string cannot be parsed.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        return Descriptors.CalcMolDescriptors(parsed)
    return {}


def process_row(smiles):
    """Featurise one SMILES string.

    Args:
        smiles: SMILES representation of the molecule.

    Returns:
        A dict with the keys 'fingerprint' (value of `compute_fingerprint`)
        and 'descriptors' (value of `Descriptors.CalcMolDescriptors`). For a
        SMILES string RDKit cannot parse the values are None and {}.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {'fingerprint': None, 'descriptors': {}}
    # Reuse the molecule parsed above; going through
    # calculate_descriptors(smiles) would parse the same SMILES a second time.
    return {
        'fingerprint': compute_fingerprint(mol),
        'descriptors': Descriptors.CalcMolDescriptors(mol),
    }

In [None]:
# Featurise the whole training parquet in chunks of CHUNK_SIZE rows, join the
# per-protein CTD descriptors and stream everything into one parquet file.
OFFSET = 0
CHUNK_SIZE = 10000
train_parquet = f'{data_dir}/train.parquet'

output_dir = f"{data_dir}/preprocessed"
output_file = f"{output_dir}/train.parquet"
os.makedirs(output_dir, exist_ok=True)

ctd_df = pd.read_csv(f"{data_dir}/protein_desc/protein_descriptors.csv")

num_workers = cpu_count()

writer = None           # opened once the first chunk has fixed the schema
used_descriptor = None  # descriptor columns kept, decided on the first chunk
first_chunk = True

con = duckdb.connect()
try:
    # The context manager shuts the worker pool down even when a chunk fails.
    with Pool(num_workers) as pool:
        # Runs until a read returns no rows. For a quick trial bound the loop
        # instead, e.g. `while OFFSET < 20000:`.
        while True:
            start_time = time.time()

            chunk = con.execute(f"""
            SELECT *
            FROM parquet_scan('{train_parquet}')
            LIMIT {CHUNK_SIZE} OFFSET {OFFSET}
            """).fetch_df()

            if chunk.empty:
                break

            smiles_list = chunk['molecule_smiles'].tolist()

            # Featurise the molecules in parallel
            results = pool.map(process_row, smiles_list)

            # Turn the results into columns
            fingerprints = [result['fingerprint'] for result in results]
            descriptors_list = [result['descriptors'] for result in results]

            chunk['fingerprints'] = fingerprints
            descriptor_df = pd.DataFrame(descriptors_list)

            if first_chunk:
                # Descriptors containing NaN in the first chunk are excluded.
                excluded_descriptors = descriptor_df.columns[descriptor_df.isna().any()].tolist()
                used_descriptor = [col for col in descriptor_df.columns if col not in excluded_descriptors]
                print(f"제외된 descriptors: {excluded_descriptors}")
                print(f"사용된 descriptors: {used_descriptor}")

            # Every chunk keeps exactly the columns chosen on the first chunk.
            # Deciding the exclusions per chunk could give a later chunk a
            # different column set than the schema the writer was opened with.
            descriptor_df = descriptor_df.reindex(columns=used_descriptor)

            # Join the CTD descriptors on protein_name. The left join keeps
            # the row order of `chunk`, so the positional concat below lines up.
            merged_chunk = pd.merge(chunk, ctd_df, on='protein_name', how='left')
            merged_chunk = pd.concat([merged_chunk, descriptor_df], axis=1)

            table = pa.Table.from_pandas(merged_chunk)

            if first_chunk:
                writer = pq.ParquetWriter(output_file, table.schema)
                first_chunk = False

            writer.write_table(table)
            OFFSET += CHUNK_SIZE

            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f"Processed offset: {OFFSET} saved to {output_file}. Time taken: {elapsed_time:.2f} seconds")
finally:
    # `writer` stays None when no chunk was read, so guard the close.
    if writer is not None:
        writer.close()
    con.close()

# 5.binds=1인 데이터의 총량으로 10 fold 구축하기

In [None]:
## Convert a molecule into a Morgan fingerprint.
_MORGAN_GENERATOR = None  # built on first use, then reused for every molecule


def compute_fingerprint(mol):
    """Return the radius-2, 2048-bit Morgan fingerprint of `mol` as text.

    Args:
        mol: RDKit molecule, or None when the SMILES could not be parsed.

    Returns:
        The fingerprint serialised with `BitVectToText`, or None when `mol`
        is None.
    """
    global _MORGAN_GENERATOR
    if mol is None:
        return None
    if _MORGAN_GENERATOR is None:
        # rdFingerprintGenerator replaces the deprecated
        # AllChem.GetMorganFingerprintAsBitVect; `fpSize` is the new name of
        # the old `nBits` argument.
        _MORGAN_GENERATOR = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
    fp = _MORGAN_GENERATOR.GetFingerprint(mol)
    return BitVectToText(fp)  # Convert to BitString for storage


## Compute the RDKit descriptors of a molecule given as SMILES.
def calculate_descriptors(smiles):
    """Return the descriptor mapping of `smiles`, or {} if it does not parse."""
    parsed = Chem.MolFromSmiles(smiles)
    unparseable = parsed is None
    if unparseable:
        result = {}
    else:
        result = Descriptors.CalcMolDescriptors(parsed)
    return result


def process_row(smiles):
    """Featurise one SMILES string.

    Args:
        smiles: SMILES representation of the molecule.

    Returns:
        A dict with the keys 'fingerprint' (value of `compute_fingerprint`)
        and 'descriptors' (value of `Descriptors.CalcMolDescriptors`). For a
        SMILES string RDKit cannot parse the values are None and {}.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {'fingerprint': None, 'descriptors': {}}
    # Reuse the molecule parsed above; going through
    # calculate_descriptors(smiles) would parse the same SMILES a second time.
    return {
        'fingerprint': compute_fingerprint(mol),
        'descriptors': Descriptors.CalcMolDescriptors(mol),
    }

In [None]:
con = duckdb.connect()


def _count_rows(binds_value):
    """Return how many training rows carry the given `binds` label."""
    relation = con.query(f"""SELECT COUNT(*) 
                              FROM parquet_scan('{train_parquet}') 
                              WHERE binds = {binds_value}""")
    return relation.fetchone()[0]


## Rows labelled binds=0
count_binds_0 = _count_rows(0)
print(f"Total binds=0 : {count_binds_0}")

## Rows labelled binds=1
count_binds_1 = _count_rows(1)
print(f"Total binds=1 : {count_binds_1}")

## Total number of rows (sum of both classes)
total_count = sum((count_binds_0, count_binds_1))
print(f"Total data : {total_count}")

con.close()

In [None]:
# Rows per output chunk for each class. The divisors are hard-coded: the
# binds=1 rows are divided into 10 chunks, the binds=0 rows into 1800
# (presumably so both chunk sizes end up comparable — TODO confirm).
b0_chunk_size = count_binds_0 // 1800
b1_chunk_size = count_binds_1 // 10
print(b0_chunk_size)
print(b1_chunk_size)

In [None]:
# Root folder under which the per-class outputs are written.
output_dir = f"{data_dir}/preprocessed"
# Per-protein descriptor table read from protein_desc/protein_descriptors.csv.
ctd_df = pd.read_csv(f"{data_dir}/protein_desc/protein_descriptors.csv")

In [None]:
def _fit_protein_encoder(ctd_df):
    """Fit a dense one-hot encoder on the `protein_name` column of `ctd_df`."""
    try:
        # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
        # the old `sparse` keyword was removed in 1.4.
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older releases only know the original `sparse` keyword.
        encoder = OneHotEncoder(sparse=False)
    encoder.fit(ctd_df[['protein_name']])
    return encoder


def _attach_protein_features(frame, ctd_df, encoder):
    """Join the CTD descriptors and swap `protein_name` for one-hot columns."""
    merged = pd.merge(frame, ctd_df, on='protein_name', how='left')
    encoded = encoder.transform(merged[['protein_name']])
    encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['protein_name']))
    return pd.concat([merged, encoded_df], axis=1).drop(columns=['protein_name'])


def class_per_data_spliter(binds, chunk_size, parquet_path, ctd_df, process_row, output_path, max_chunk=0):
    """Featurise all rows of one `binds` class and write them as CSV chunks.

    Rows with the requested label are read `chunk_size` at a time. For every
    chunk two files are written below `output_path`:
    `fingerprints/fingerprints_b{binds}_chunk_{i}.csv` and
    `descriptors/descriptors_b{binds}_chunk_{i}.csv`. Both carry the `binds`
    label, the CTD descriptors of the row's protein and a one-hot encoding of
    the protein name.

    Args:
        binds: Label value used in the `WHERE binds = ...` filter and stored
            in the `binds` column of the outputs.
        chunk_size: Number of rows read per chunk.
        parquet_path: Parquet file to read from.
        ctd_df: Frame with a `protein_name` column plus per-protein features.
        process_row: Callable mapped over the SMILES strings; it must return a
            dict with the keys 'fingerprint' and 'descriptors'.
        output_path: Folder that receives the two output sub-folders.
        max_chunk: Stop after this many chunks; 0 processes every row.
    """
    fingerprints_folder = os.path.join(output_path, 'fingerprints')
    descriptors_folder = os.path.join(output_path, 'descriptors')
    os.makedirs(fingerprints_folder, exist_ok=True)
    os.makedirs(descriptors_folder, exist_ok=True)

    encoder = _fit_protein_encoder(ctd_df)  # fitted on the proteins in ctd_df

    offset = 0
    chunk_number = 0

    con = duckdb.connect()
    try:
        # The context manager shuts the worker pool down even when a chunk
        # fails; the `finally` below does the same for the connection.
        with Pool(cpu_count()) as pool:
            while True:
                if max_chunk and chunk_number == max_chunk:
                    break

                start_time = time.time()

                # Phase 1: read the next chunk of rows with the requested label
                phase1_start = time.time()
                chunk = con.execute(f"""
                SELECT *
                FROM parquet_scan('{parquet_path}')
                WHERE binds = {binds}
                LIMIT {chunk_size}
                OFFSET {offset}
                """).fetch_df()
                phase1_end = time.time()
                print(f"Phase 1: Data read - Time taken: {phase1_end - phase1_start:.2f} seconds")

                if chunk.empty:
                    break

                smiles_list = chunk['molecule_smiles'].tolist()
                protein_names = chunk['protein_name'].tolist()

                # Phase 2: featurise the molecules in parallel
                phase2_start = time.time()
                results = pool.map(process_row, smiles_list)
                phase2_end = time.time()
                print(f"Phase 2: Parallel processing - Time taken: {phase2_end - phase2_start:.2f} seconds")

                # Phase 3: split the results into fingerprints and descriptors
                phase3_start = time.time()
                fingerprints = [result['fingerprint'] for result in results]
                descriptors_list = [result['descriptors'] for result in results]
                phase3_end = time.time()
                print(f"Phase 3: Convert results to DataFrame - Time taken: {phase3_end - phase3_start:.2f} seconds")

                # Phase 4: build and save the fingerprint table
                phase4_start = time.time()
                # The fingerprint strings land in a single column named 0.
                fingerprints_df = pd.DataFrame(fingerprints)
                fingerprints_df['protein_name'] = protein_names
                fingerprints_df['binds'] = binds

                merged_fingerprints = _attach_protein_features(fingerprints_df, ctd_df, encoder)

                # Drop the raw SMILES columns should any of them be present.
                columns_to_drop = ['buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles', 'molecule_smiles']
                merged_fingerprints = merged_fingerprints.drop(columns=columns_to_drop, errors='ignore')

                fingerprints_file = os.path.join(fingerprints_folder, f'fingerprints_b{binds}_chunk_{chunk_number}.csv')
                merged_fingerprints.to_csv(fingerprints_file, index=False)
                phase4_end = time.time()
                print(f"Phase 4: Save fingerprints - Time taken: {phase4_end - phase4_start:.2f} seconds")
                print(fingerprints_file)

                # Phase 5: build and save the descriptor table
                phase5_start = time.time()
                descriptor_df = pd.DataFrame(descriptors_list)
                descriptor_df['protein_name'] = protein_names
                descriptor_df['binds'] = binds
                # Columns containing NaN are dropped per chunk, so the column
                # set of the descriptor files can differ between chunks.
                excluded_descriptors = descriptor_df.columns[descriptor_df.isna().any()].tolist()
                descriptor_df = descriptor_df.drop(columns=excluded_descriptors)
                used_descriptor = descriptor_df.columns.tolist()

                if chunk_number == 0:
                    print(f"제외된 descriptors: {excluded_descriptors}")
                    print(f"사용된 descriptors: {used_descriptor}")

                merged_descriptors = _attach_protein_features(descriptor_df, ctd_df, encoder)

                descriptors_file = os.path.join(descriptors_folder, f'descriptors_b{binds}_chunk_{chunk_number}.csv')
                merged_descriptors.to_csv(descriptors_file, index=False)
                phase5_end = time.time()
                print(f"Phase 5: Save descriptors - Time taken: {phase5_end - phase5_start:.2f} seconds")
                print(descriptors_file)

                offset += chunk_size
                chunk_number += 1

                end_time = time.time()
                elapsed_time = end_time - start_time
                print(f"Processed chunk: {chunk_number}, offset: {offset}, Total Time taken: {elapsed_time:.2f} seconds \n")
    finally:
        con.close()


In [None]:
# Featurise the binds=1 rows: at most 10 chunks (max_chunk=10) of
# b1_chunk_size rows each, written below <output_dir>/binds1.
class_per_data_spliter(binds=1, 
                       chunk_size=b1_chunk_size, 
                       parquet_path=train_parquet, 
                       ctd_df=ctd_df, 
                       process_row=process_row, 
                       output_path=f"{output_dir}/binds1",
                       max_chunk=10)

In [None]:
# Featurise the binds=0 rows: processing stops after 10 chunks (max_chunk=10)
# of b0_chunk_size rows each; the files go below <output_dir>/binds0.
class_per_data_spliter(binds=0, 
                       chunk_size=b0_chunk_size, 
                       parquet_path=train_parquet, 
                       ctd_df=ctd_df, 
                       process_row=process_row, 
                       output_path=f"{output_dir}/binds0",
                       max_chunk=10)

# 6.Test set

In [None]:
## Convert a molecule into a Morgan fingerprint.
_MORGAN_GENERATOR = None  # built on first use, then reused for every molecule


def compute_fingerprint(mol):
    """Return the radius-2, 2048-bit Morgan fingerprint of `mol` as text.

    Args:
        mol: RDKit molecule, or None when the SMILES could not be parsed.

    Returns:
        The fingerprint serialised with `BitVectToText`, or None when `mol`
        is None.
    """
    global _MORGAN_GENERATOR
    if mol is None:
        return None
    if _MORGAN_GENERATOR is None:
        # rdFingerprintGenerator replaces the deprecated
        # AllChem.GetMorganFingerprintAsBitVect; `fpSize` is the new name of
        # the old `nBits` argument.
        _MORGAN_GENERATOR = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
    fp = _MORGAN_GENERATOR.GetFingerprint(mol)
    return BitVectToText(fp)  # Convert to BitString for storage


## Compute the RDKit descriptors of a molecule given as SMILES.
def calculate_descriptors(smiles):
    """Return RDKit's descriptor mapping for `smiles` ({} when unparseable)."""
    parsed = Chem.MolFromSmiles(smiles)
    # Guard first: CalcMolDescriptors is only called on a parsed molecule.
    if parsed is None:
        return dict()
    return Descriptors.CalcMolDescriptors(parsed)


def process_row(smiles):
    """Featurise one SMILES string.

    Args:
        smiles: SMILES representation of the molecule.

    Returns:
        A dict with the keys 'fingerprint' (value of `compute_fingerprint`)
        and 'descriptors' (value of `Descriptors.CalcMolDescriptors`). For a
        SMILES string RDKit cannot parse the values are None and {}.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {'fingerprint': None, 'descriptors': {}}
    # Reuse the molecule parsed above; going through
    # calculate_descriptors(smiles) would parse the same SMILES a second time.
    return {
        'fingerprint': compute_fingerprint(mol),
        'descriptors': Descriptors.CalcMolDescriptors(mol),
    }

In [None]:
# Featurise the test parquet in chunks. After every chunk a CSV with all rows
# processed so far is written, so the file with the highest chunk number
# holds the complete test set.
offset = 0
chunk_size = 10000

output_dir = f"{data_dir}/for_test"
os.makedirs(output_dir, exist_ok=True)

ctd_df = pd.read_csv(f"{data_dir}/protein_desc/protein_descriptors.csv")

num_workers = cpu_count()

fingerprints_folder = os.path.join(output_dir, 'fingerprints')
descriptors_folder = os.path.join(output_dir, 'descriptors')
os.makedirs(fingerprints_folder, exist_ok=True)
os.makedirs(descriptors_folder, exist_ok=True)

try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the old `sparse` keyword was removed in 1.4.
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older releases only know the original `sparse` keyword.
    encoder = OneHotEncoder(sparse=False)
encoder.fit(ctd_df[['protein_name']])  # fitted on the proteins in ctd_df

# Rows accumulated over all chunks processed so far. Keeping them in memory
# avoids re-reading and re-parsing the previous (ever growing) CSV per chunk.
accumulated_fingerprints = None
accumulated_descriptors = None

chunk_number = 0
con = duckdb.connect()
try:
    # The context manager shuts the worker pool down even when a chunk fails.
    with Pool(num_workers) as pool:
        while True:
            start_time = time.time()

            # Phase 1: read the next chunk
            phase1_start = time.time()
            chunk = con.execute(f"""
            SELECT *
            FROM parquet_scan('{test_parquet}')
            LIMIT {chunk_size}
            OFFSET {offset}
            """).fetch_df()
            phase1_end = time.time()
            print(f"Phase 1: Data read - Time taken: {phase1_end - phase1_start:.2f} seconds")

            if chunk.empty:
                break

            smiles_list = chunk['molecule_smiles'].tolist()
            protein_names = chunk['protein_name'].tolist()
            ids = chunk['id'].tolist()
            buildingblock1_smiles = chunk['buildingblock1_smiles'].tolist()
            buildingblock2_smiles = chunk['buildingblock2_smiles'].tolist()
            buildingblock3_smiles = chunk['buildingblock3_smiles'].tolist()

            # Phase 2: featurise the molecules in parallel
            phase2_start = time.time()
            results = pool.map(process_row, smiles_list)
            phase2_end = time.time()
            print(f"Phase 2: Parallel processing - Time taken: {phase2_end - phase2_start:.2f} seconds")

            # Phase 3: split the results into fingerprints and descriptors
            phase3_start = time.time()
            fingerprints = [result['fingerprint'] for result in results]
            descriptors_list = [result['descriptors'] for result in results]
            phase3_end = time.time()
            print(f"Phase 3: Convert results to DataFrame - Time taken: {phase3_end - phase3_start:.2f} seconds")

            # Phase 4: build and save the fingerprint table
            phase4_start = time.time()
            fingerprints_df = pd.DataFrame({'fingerprints': fingerprints, 'protein_name': protein_names, 'id': ids,
                                            'buildingblock1_smiles': buildingblock1_smiles, 'buildingblock2_smiles': buildingblock2_smiles, 'buildingblock3_smiles': buildingblock3_smiles})

            # Join the CTD descriptors of each row's protein
            merged_fingerprints = pd.merge(fingerprints_df, ctd_df, on='protein_name', how='left')

            # Replace protein_name by its one-hot encoding
            protein_name_encoded_fingerprints = encoder.transform(merged_fingerprints[['protein_name']])
            protein_name_encoded_df_fingerprints = pd.DataFrame(protein_name_encoded_fingerprints, columns=encoder.get_feature_names_out(['protein_name']))
            merged_fingerprints = pd.concat([merged_fingerprints, protein_name_encoded_df_fingerprints], axis=1).drop(columns=['protein_name'])

            # Append this chunk to the rows of all previous chunks
            if accumulated_fingerprints is not None:
                merged_fingerprints = pd.concat([accumulated_fingerprints, merged_fingerprints], ignore_index=True)
            accumulated_fingerprints = merged_fingerprints

            fingerprints_file = os.path.join(fingerprints_folder, f'fingerprints_chunk_{chunk_number}.csv')
            merged_fingerprints.to_csv(fingerprints_file, index=False)
            phase4_end = time.time()
            print(f"Phase 4: Save fingerprints - Time taken: {phase4_end - phase4_start:.2f} seconds")
            print(fingerprints_file)

            # Phase 5: build and save the descriptor table
            phase5_start = time.time()
            descriptor_df = pd.DataFrame(descriptors_list)
            descriptor_df['protein_name'] = protein_names
            descriptor_df['id'] = ids
            descriptor_df['buildingblock1_smiles'] = buildingblock1_smiles
            descriptor_df['buildingblock2_smiles'] = buildingblock2_smiles
            descriptor_df['buildingblock3_smiles'] = buildingblock3_smiles

            # Columns containing NaN in this chunk are dropped before merging.
            excluded_descriptors = descriptor_df.columns[descriptor_df.isna().any()].tolist()
            descriptor_df = descriptor_df.drop(columns=excluded_descriptors)
            used_descriptor = descriptor_df.columns.tolist()

            if chunk_number == 0:
                print(f"제외된 descriptors: {excluded_descriptors}")
                print(f"사용된 descriptors: {used_descriptor}")

            # Join the CTD descriptors of each row's protein
            merged_descriptors = pd.merge(descriptor_df, ctd_df, on='protein_name', how='left')

            # Replace protein_name by its one-hot encoding
            protein_name_encoded_descriptors = encoder.transform(merged_descriptors[['protein_name']])
            protein_name_encoded_df_descriptors = pd.DataFrame(protein_name_encoded_descriptors, columns=encoder.get_feature_names_out(['protein_name']))
            merged_descriptors = pd.concat([merged_descriptors, protein_name_encoded_df_descriptors], axis=1).drop(columns=['protein_name'])

            # Append this chunk to the rows of all previous chunks
            if accumulated_descriptors is not None:
                merged_descriptors = pd.concat([accumulated_descriptors, merged_descriptors], ignore_index=True)
            accumulated_descriptors = merged_descriptors

            descriptors_file = os.path.join(descriptors_folder, f'descriptors_chunk_{chunk_number}.csv')
            merged_descriptors.to_csv(descriptors_file, index=False)
            phase5_end = time.time()
            print(f"Phase 5: Save descriptors - Time taken: {phase5_end - phase5_start:.2f} seconds")
            print(descriptors_file)

            offset += chunk_size
            chunk_number += 1

            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f"Processed chunk: {chunk_number}, offset: {offset}, Total Time taken: {elapsed_time:.2f} seconds \n")
finally:
    con.close()