In [None]:
import os
import time
import warnings
warnings.filterwarnings("ignore")

import duckdb
import requests
import PyBioMed
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import matplotlib.pyplot as plt

from multiprocessing import Pool, cpu_count

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs import BitVectToText
from rdkit.Chem import Descriptors, rdFingerprintGenerator

from PyBioMed.Pyprotein import PyProtein
from PyBioMed.PyGetMol import GetProtein

In [None]:
data_dir = "/home/pervinco/Datasets/leash-bio"
save_dir = f"{data_dir}/split_sets"

train_csv = f"{data_dir}/train.csv"
test_csv = f"{data_dir}/test.csv"

train_parquet = f"{data_dir}/train.parquet"
test_parquet = f'{data_dir}/test.parquet'

target_proteins = ['sEH', 'HSA', 'BRD4']

# 1.데이터셋 분석

In [None]:
con = duckdb.connect()

## binds=0인 데이터만 load
count_binds_0 = con.query(f"""SELECT COUNT(*) 
                              FROM parquet_scan('{train_parquet}') 
                              WHERE binds = 0""").fetchone()[0]
print(f"Total binds=0 : {count_binds_0}")

## binds=1인 데이터만 load
count_binds_1 = con.query(f"""SELECT COUNT(*) 
                              FROM parquet_scan('{train_parquet}') 
                              WHERE binds = 1""").fetchone()[0]
print(f"Total binds=1 : {count_binds_1}")

## 전체 데이터 수
total_count = count_binds_0 + count_binds_1
print(f"Total data : {total_count}")

con.close()

In [None]:
## 컬럼별 고유한 값, 갯수 파악

columns = [
    'buildingblock1_smiles', 
    'buildingblock2_smiles', 
    'buildingblock3_smiles', 
    'molecule_smiles', 
    'protein_name'
]

con = duckdb.connect()
for column in columns:
    query = f"SELECT {column}, COUNT(*) as count FROM parquet_scan('{train_parquet}') GROUP BY {column}"
    df = con.query(query).df()

    df.to_csv(f"{data_dir}/info/{column}_info.csv", index=False)

con.close()

In [None]:
## 컬럼별 중복 데이터 확인

con = duckdb.connect()
bb1_query = f"SELECT DISTINCT buildingblock1_smiles FROM parquet_scan('{train_parquet}')"
bb2_query = f"SELECT DISTINCT buildingblock2_smiles FROM parquet_scan('{train_parquet}')"
bb3_query = f"SELECT DISTINCT buildingblock3_smiles FROM parquet_scan('{train_parquet}')"

bb1_set = set(con.query(bb1_query).df()['buildingblock1_smiles'])
bb2_set = set(con.query(bb2_query).df()['buildingblock2_smiles'])
bb3_set = set(con.query(bb3_query).df()['buildingblock3_smiles'])

bb1_bb2_intersection = bb1_set.intersection(bb2_set)
bb1_bb3_intersection = bb1_set.intersection(bb3_set)
bb2_bb3_intersection = bb2_set.intersection(bb3_set)

print(f"Building block 1 & 2 중복 : {'있음' if bb1_bb2_intersection else '없음'}")
print(f"Building block 1 & 3 중복 : {'있음' if bb1_bb3_intersection else '없음'}")
print(f"Building block 2 & 3 중복 : {'있음' if bb2_bb3_intersection else '없음'}")

print(f"Building block 1과 2 사이의 중복된 값: {bb1_bb2_intersection}")
print(f"Building block 1과 3 사이의 중복된 값: {bb1_bb3_intersection}")
print(f"Building block 2와 3 사이의 중복된 값: {bb2_bb3_intersection}")

con.close()

In [None]:
limit = 2000

con = duckdb.connect()
data = con.query(f"""(SELECT *
                      FROM parquet_scan('{train_parquet}')
                      WHERE binds = 0
                      LIMIT {limit})
                      UNION ALL
                      (SELECT *
                      FROM parquet_scan('{train_parquet}')
                      WHERE binds = 1
                      LIMIT {limit})""").df()

con.close()
data.head()

# 2.RDKit을 활용한 분석

In [None]:
fpg = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

## molecule을 Morgan FingerPrint로 변환.
def compute_fingerprint(mol):
    if mol is None:
        return None
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    return BitVectToText(fp)  # Convert to BitString for storage

In [None]:
## molecule로부터 descriptor 계산.
def calculate_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {}
    descriptors = Descriptors.CalcMolDescriptors(mol)
    return descriptors

In [None]:
def process_row(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {'fingerprint': None, 'descriptors': {}}
    fingerprint = compute_fingerprint(mol)
    descriptors = calculate_descriptors(smiles)
    return {'fingerprint': fingerprint, 'descriptors': descriptors}

In [None]:
"""
molecule_smiles는 building block들로 조합된 약물 분자.
모델 학습을 위해서는 인코딩을 적용해 컴퓨터가 이해할 수 있는 형태로 변환해야함.
"""

OFFSET = 0
CHUNK_SIZE = 1000
train_parquet = f'{data_dir}/train.parquet'
con = duckdb.connect()

output_dir = f"{data_dir}/preprocessed"
output_file = f"{output_dir}/train.parquet"
os.makedirs(output_dir, exist_ok=True)

num_workers = cpu_count()
pool = Pool(num_workers)

first_chunk = True
while OFFSET < 3000:
    chunk = con.execute(f"""
    SELECT *
    FROM parquet_scan('{train_parquet}')
    LIMIT {CHUNK_SIZE} OFFSET {OFFSET}
    """).fetch_df()

    if chunk.empty:
        break

    smiles_list = chunk['molecule_smiles'].tolist()
    
    ## 병렬로 데이터 처리
    results = pool.map(process_row, smiles_list)

    ## 결과를 데이터프레임으로 변환
    fingerprints = [result['fingerprint'] for result in results]
    descriptors_list = [result['descriptors'] for result in results]
    
    chunk['fingerprints'] = fingerprints
    descriptor_df = pd.DataFrame(descriptors_list)
    excluded_descriptors = descriptor_df.columns[descriptor_df.isna().any()].tolist()
    descriptor_df.drop(columns=excluded_descriptors, inplace=True)
    used_descriptor = descriptor_df.columns.tolist()

    if first_chunk:
        print(f"제외된 descriptors: {excluded_descriptors}")
        print(f"사용된 descriptors: {used_descriptor}")

    chunk = pd.concat([chunk, descriptor_df], axis=1)
    table = pa.Table.from_pandas(chunk)

    if first_chunk:
        writer = pq.ParquetWriter(output_file, table.schema)
        first_chunk = False

    writer.write_table(table)
    print(f"Processed offset: {OFFSET} saved to {output_file}")
    OFFSET += CHUNK_SIZE

pool.close()
pool.join()

writer.close()
con.close()

In [None]:
df = pd.read_parquet(output_file, engine='pyarrow', )
df.head()

In [None]:
print(df.columns)
print(df.shape)

# 3.Target Protein Descriptor

In [None]:
uniprot_dicts = {"sEH": "P34913", "BRD4": "O60885", "ALB": "P02768"}
output_dir = f"{data_dir}/protein_desc"
os.makedirs(output_dir, exist_ok=True)

def get_protein_sequence(uniprot_id):
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
    response = requests.get(url)
    if response.status_code == 200:
        response_text = response.text
        lines = response_text.splitlines()
        seq = "".join(lines[1:])
        return seq
    else:
        return None

protein_seq_dicts = {}
for protein_name, uniprot_id in uniprot_dicts.items():
    protein_sequence = get_protein_sequence(uniprot_id)
    if protein_sequence:
        protein_seq_dicts[protein_name] = protein_sequence
    else:
        print(f"Failed to retrieve sequence for {protein_name} ({uniprot_id})")

ctd_features = {}
for protein_name, sequence in protein_seq_dicts.items():
    protein_class = PyProtein(sequence)
    CTD = protein_class.GetCTD()
    ctd_features[protein_name] = CTD

ctd_df = pd.DataFrame(ctd_features).T
ctd_df.to_csv(f"{output_dir}/protein_descriptors.csv")
ctd_df.head()

In [9]:
OFFSET = 0
CHUNK_SIZE = 10000
train_parquet = f'{data_dir}/train.parquet'
con = duckdb.connect()

output_dir = f"{data_dir}/preprocessed"
output_file = f"{output_dir}/train.parquet"
os.makedirs(output_dir, exist_ok=True)

ctd_df = pd.read_csv(f"{data_dir}/protein_desc/protein_descriptors.csv")

num_workers = cpu_count()
pool = Pool(num_workers)

first_chunk = True
# while OFFSET < 20000:
while True:
    start_time = time.time()
    
    chunk = con.execute(f"""
    SELECT *
    FROM parquet_scan('{train_parquet}')
    LIMIT {CHUNK_SIZE} OFFSET {OFFSET}
    """).fetch_df()

    if chunk.empty:
        break

    smiles_list = chunk['molecule_smiles'].tolist()
    
    # 병렬로 데이터 처리
    results = pool.map(process_row, smiles_list)

    # 결과를 데이터프레임으로 변환
    fingerprints = [result['fingerprint'] for result in results]
    descriptors_list = [result['descriptors'] for result in results]
    
    chunk['fingerprints'] = fingerprints
    descriptor_df = pd.DataFrame(descriptors_list)
    excluded_descriptors = descriptor_df.columns[descriptor_df.isna().any()].tolist()
    descriptor_df.drop(columns=excluded_descriptors, inplace=True)
    used_descriptor = descriptor_df.columns.tolist()

    if first_chunk:
        print(f"제외된 descriptors: {excluded_descriptors}")
        print(f"사용된 descriptors: {used_descriptor}")

    # CTD 특징 및 One-Hot Encoding 추가
    chunk = pd.concat([chunk, descriptor_df, ctd_df.reset_index(drop=True)], axis=1)
    table = pa.Table.from_pandas(chunk)

    if first_chunk:
        writer = pq.ParquetWriter(output_file, table.schema)
        first_chunk = False

    writer.write_table(table)
    OFFSET += CHUNK_SIZE

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Processed offset: {OFFSET} saved to {output_file}. Time taken: {elapsed_time:.2f} seconds")

pool.close()
pool.join()

writer.close()
con.close()

제외된 descriptors: ['MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW']
사용된 descriptors: ['MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'SPS', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA

In [8]:
df = pd.read_parquet("/home/pervinco/Datasets/leash-bio/preprocessed/train.parquet", engine='pyarrow')
df.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds,fingerprints,MaxAbsEStateIndex,MaxEStateIndex,...,_HydrophobicityD2001,_HydrophobicityD2025,_HydrophobicityD2050,_HydrophobicityD2075,_HydrophobicityD2100,_HydrophobicityD3001,_HydrophobicityD3025,_HydrophobicityD3050,_HydrophobicityD3075,_HydrophobicityD3100
0,0,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,BRD4,0,0100000000000000000000000000000000000000010000...,12.022584,12.022584,...,0.36,26.306,49.189,70.811,99.64,0.18,25.225,48.829,74.955,100.0
1,1,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,HSA,0,0100000000000000000000000000000000000000010000...,12.022584,12.022584,...,0.147,23.789,53.01,72.687,99.486,0.073,19.677,45.668,69.604,100.0
2,2,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,sEH,0,0100000000000000000000000000000000000000010000...,12.022584,12.022584,...,0.821,26.601,48.768,74.713,99.836,0.164,22.989,49.754,74.548,100.0
3,3,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.NCc1cccc(Br)n1,C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC...,BRD4,0,0100000000000000000000000000000000000000010000...,11.915654,11.915654,...,,,,,,,,,,
4,4,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.NCc1cccc(Br)n1,C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC...,HSA,0,0100000000000000000000000000000000000000010000...,11.915654,11.915654,...,,,,,,,,,,
