In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

import joblib
import duckdb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem import DataStructs, Descriptors, rdFingerprintGenerator

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [2]:
# Where trained models ("weights") and fitted preprocessing objects ("utils") are stored.
model_save_dir = "/home/pervinco/Models/leash_bio"
for subfolder in ("weights", "utils"):
    os.makedirs(f"{model_save_dir}/{subfolder}", exist_ok=True)

# Despite the name, this is the path of the submission CSV file itself.
submission_dir = "./submission.csv"

# Input dataset files.
data_dir = "/home/pervinco/Datasets/leash-bio"
train_csv, test_csv = f"{data_dir}/train.csv", f"{data_dir}/test.csv"
train_parquet, test_parquet = f"{data_dir}/train.parquet", f'{data_dir}/test.parquet'

# Chunked-processing settings: rows fetched per query, the running row offset,
# and how many models the training loop produces.
CHUNK_SIZE = 100000
OFFSET = 0
NUM_MODELS = 10
FIRST_CHUNK = True

# DuckDB connection used to run SQL directly over the parquet files.
con = duckdb.connect()

In [3]:
# Each entry of Descriptors._descList is indexed as (name, function); split the
# entries into two parallel lists in a single pass.
descriptor_names = []
descriptor_functions = []
for desc_name, desc_function in Descriptors._descList:
    descriptor_names.append(desc_name)
    descriptor_functions.append(desc_function)

print(len(descriptor_names))
print(descriptor_names)

210
['MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'SPS', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10'

In [4]:
def fetch_data(binds, offset, chunk_size):
    """Return one randomly ordered slice of training rows with the given label.

    Args:
        binds: Value compared against the ``binds`` column in the WHERE clause.
        offset: Number of rows skipped in the shuffled result.
        chunk_size: Maximum number of rows returned.

    Returns:
        A pandas DataFrame holding every column of the matching rows.

    Note:
        The arguments are interpolated directly into the SQL text. The row
        order comes from ``random()``, which is presumably re-drawn on each
        call, so offsets used in separate calls should not be expected to
        partition the data -- TODO confirm.
    """
    sql = f"""
    SELECT *
    FROM parquet_scan('{train_parquet}')
    WHERE binds = {binds}
    ORDER BY random()
    LIMIT {chunk_size} OFFSET {offset}
    """
    relation = con.query(sql)
    return relation.df()

def calculate_descriptors(smiles):
    """Evaluate every function in ``descriptor_functions`` on one molecule.

    Args:
        smiles: SMILES string handed to ``Chem.MolFromSmiles``.

    Returns:
        A list with one value per descriptor function, in the same order as
        ``descriptor_functions``. When the SMILES cannot be parsed (the parser
        returns ``None``), a list of NaN of length ``len(descriptor_names)``
        is returned instead.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is not None:
        return [compute(molecule) for compute in descriptor_functions]

    # Unparseable input: emit a NaN placeholder row so the output width stays fixed.
    return [float('nan')] * len(descriptor_names)


# Shared preprocessing objects: fitted on the first training chunk, then saved with joblib.
encoder = OneHotEncoder()  # one-hot encodes the protein_name column
scaler = StandardScaler()  # standardizes the assembled feature matrix

In [5]:
#### Train CLASS
#### {'sEH', 'HSA', 'BRD4'}

# offset = 0
# unique_protein_names = set()

# while True:
#     chunk = con.query(f"""
#     SELECT protein_name
#     FROM parquet_scan('{train_parquet}')
#     LIMIT {CHUNK_SIZE} OFFSET {offset}
#     """).df()
    
#     if chunk.empty:
#         break
    
#     unique_protein_names.update(chunk['protein_name'].unique())
#     offset += CHUNK_SIZE

# print(unique_protein_names)

In [6]:
# Train NUM_MODELS random forests, each on its own random sample of CHUNK_SIZE rows.
# The encoder, the scaler and the feature list are all fixed by the first chunk so
# that every model (and later the inference cell) sees the same columns.
for idx in range(NUM_MODELS):
    # ORDER BY random() reshuffles the table on every query, so an OFFSET does not
    # yield disjoint chunks. A plain LIMIT draws the same kind of random sample and
    # leaves the shared OFFSET constant untouched for the inference cell.
    chunk = con.query(f"""
    SELECT *
    FROM parquet_scan('{train_parquet}')
    ORDER BY random()
    LIMIT {CHUNK_SIZE}
    """).df()

    descriptor_df = pd.DataFrame(chunk['molecule_smiles'].apply(calculate_descriptors).tolist(), columns=descriptor_names)
    if idx == 0:
        # Descriptors that are NaN for any molecule of the first chunk are dropped
        # for the whole run. Re-deriving this list per chunk could change the
        # column set and break scaler.transform on later chunks.
        excluded_descriptors = descriptor_df.columns[descriptor_df.isna().any()].tolist()
        used_descriptor = descriptor_df.columns.drop(excluded_descriptors).tolist()
    descriptor_df = descriptor_df[used_descriptor]

    print(f"제외된 descriptors: {excluded_descriptors}")
    print(f"사용된 descriptors: {used_descriptor}")
    chunk = pd.concat([chunk, descriptor_df], axis=1)

    protein_name_encoded = encoder.fit_transform(chunk[['protein_name']]) if idx == 0 else encoder.transform(chunk[['protein_name']])
    protein_name_encoded_df = pd.DataFrame(protein_name_encoded.toarray(), columns=encoder.get_feature_names_out(['protein_name']))

    chunk = pd.concat([chunk, protein_name_encoded_df], axis=1)
    feature_columns = used_descriptor + list(protein_name_encoded_df.columns)
    X = chunk[feature_columns]
    y = chunk['binds']

    if idx == 0:
        X_scaled = scaler.fit_transform(X)
        joblib.dump(scaler, f"{model_save_dir}/utils/scaler.joblib")
        joblib.dump(encoder, f"{model_save_dir}/utils/encoder.joblib")
    else:
        X_scaled = scaler.transform(X)
        # A kept descriptor can still be NaN for a molecule in a later chunk.
        # After standardization 0 is the training mean, so use it as a neutral fill.
        X_scaled = np.where(np.isnan(X_scaled), 0.0, X_scaled)

    X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]

    accuracy = accuracy_score(y_val, y_pred)
    roc_auc = roc_auc_score(y_val, y_proba)

    print(f"Model {idx} - Accuracy: {accuracy}")
    print(f"Model {idx} - ROC AUC: {roc_auc}")
    print()

    # Models are saved as model_0 ... model_{NUM_MODELS - 1} under the weights folder.
    joblib.dump(model, f"{model_save_dir}/weights/model_{idx}.joblib")

con.close()

제외된 descriptors: ['MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW']
사용된 descriptors: ['MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'SPS', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA

In [9]:
# Count the entries in the weights folder; the inference loop averages over this many models.
weight_files = os.listdir(f"{model_save_dir}/weights")
num_models = len(weight_files)
print(num_models)

# Restore the preprocessing objects saved by the training cell.
encoder = joblib.load(f"{model_save_dir}/utils/encoder.joblib")
scaler = joblib.load(f"{model_save_dir}/utils/scaler.joblib")

# Start a fresh submission file that contains only the header row.
with open(submission_dir, 'w') as submission_file:
    submission_file.write('id,binds\n')

10


In [10]:
# # #### Test CLASS
# # #### {'sEH', 'HSA', 'BRD4'}

# offset = 0
# unique_protein_names = set()

# while True:
#     chunk = con.query(f"""
#     SELECT protein_name
#     FROM parquet_scan('{train_parquet}')
#     LIMIT {CHUNK_SIZE} OFFSET {offset}
#     """).df()
    
#     if chunk.empty:
#         break
    
#     unique_protein_names.update(chunk['protein_name'].unique())
#     offset += CHUNK_SIZE

# print(unique_protein_names)

{'sEH', 'HSA', 'BRD4'}


In [None]:
# Chunked inference over the test parquet file: every chunk is featurized exactly
# like the training data, scored by all saved models, and the averaged probability
# is written to the submission CSV.

# The training cell closes the shared connection, so open a fresh one here.
con = duckdb.connect()

# Paginate from the first row with a local counter. The shared OFFSET constant may
# have been advanced by the training cell, which would silently skip test rows.
test_offset = 0

# Use exactly the columns the scaler was fitted on, in the same order, instead of
# re-deriving them from each test chunk's NaN pattern.
feature_columns = list(scaler.feature_names_in_)

# Load every model once rather than once per chunk. Files are saved by the training
# cell as weights/model_0 ... weights/model_{N-1}. This keeps all models in memory
# for the duration of the loop.
models = [joblib.load(f"{model_save_dir}/weights/model_{i}.joblib") for i in range(num_models)]
if not models:
    raise FileNotFoundError(f"No trained models found in {model_save_dir}/weights")

while True:
    chunk = con.query(f"""
    SELECT *
    FROM parquet_scan('{test_parquet}')
    LIMIT {CHUNK_SIZE} OFFSET {test_offset}
    """).df()

    if chunk.empty:
        break

    descriptor_df = pd.DataFrame(chunk['molecule_smiles'].apply(calculate_descriptors).tolist(), columns=descriptor_names)
    protein_name_encoded = encoder.transform(chunk[['protein_name']])
    protein_name_encoded_df = pd.DataFrame(protein_name_encoded.toarray(), columns=encoder.get_feature_names_out(['protein_name']))
    chunk = pd.concat([chunk, descriptor_df, protein_name_encoded_df], axis=1)

    X_test = chunk[feature_columns]
    X_test_scaled = scaler.transform(X_test)
    # Rows cannot be dropped at inference time, so fill any NaN feature with 0,
    # which is the training mean after standardization.
    X_test_scaled = np.where(np.isnan(X_test_scaled), 0.0, X_test_scaled)

    predictions = np.zeros(X_test_scaled.shape[0])
    for model in models:
        predictions += model.predict_proba(X_test_scaled)[:, 1]

    # Average the positive-class probabilities across models for the final prediction.
    predictions /= len(models)

    chunk_submission = pd.DataFrame({'id': chunk['id'], 'binds': predictions})
    # The first chunk rewrites the file (header included) so that re-running this
    # cell does not append duplicate rows; later chunks append without a header.
    is_first_chunk = test_offset == 0
    chunk_submission.to_csv(
        submission_dir,
        mode='w' if is_first_chunk else 'a',
        header=is_first_chunk,
        index=False,
    )

    test_offset += CHUNK_SIZE

con.close()