In [5]:
import os
import random
import duckdb
import joblib
import numpy as np
import pandas as pd
import lightgbm as lgb

from glob import glob
from multiprocessing import Pool, cpu_count

from rdkit import Chem
from rdkit.Chem import AllChem

from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score

In [6]:
# Locations of the preprocessed dataset and of the training artifacts.
# Both can be overridden through environment variables so the notebook is not
# tied to a single machine; the defaults are the original hard-coded paths.
data_dir = os.environ.get("LEASH_BIO_DATA_DIR", "/home/pervinco/Datasets/leash-bio/preprocessed")
save_dir = os.environ.get("LEASH_BIO_SAVE_DIR", "/home/pervinco/Models/leash_bio")

# Trained models go to <save_dir>/weights, fitted scalers to <save_dir>/utils.
for subdir in ("weights", "utils"):
    os.makedirs(os.path.join(save_dir, subdir), exist_ok=True)

In [7]:
# Chunk CSVs of the positive class (binds=1), ordered by path.
b1_desc_files, b1_fp_files = (
    sorted(glob(pattern))
    for pattern in (
        f"{data_dir}/binds1/descriptors/*.csv",
        f"{data_dir}/binds1/fingerprints/*.csv",
    )
)
print(len(b1_desc_files), len(b1_fp_files))
print(b1_fp_files)
print(b1_desc_files, '\n')

# Chunk CSVs of the negative class (binds=0), ordered by path.
b0_desc_files, b0_fp_files = (
    sorted(glob(pattern))
    for pattern in (
        f"{data_dir}/binds0/descriptors/*.csv",
        f"{data_dir}/binds0/fingerprints/*.csv",
    )
)
print(len(b0_desc_files), len(b0_fp_files))
print(b0_fp_files)
print(b0_desc_files)

10 10
['/home/pervinco/Datasets/leash-bio/preprocessed/binds1/fingerprints/fingerprints_b1_chunk_0.csv', '/home/pervinco/Datasets/leash-bio/preprocessed/binds1/fingerprints/fingerprints_b1_chunk_1.csv', '/home/pervinco/Datasets/leash-bio/preprocessed/binds1/fingerprints/fingerprints_b1_chunk_2.csv', '/home/pervinco/Datasets/leash-bio/preprocessed/binds1/fingerprints/fingerprints_b1_chunk_3.csv', '/home/pervinco/Datasets/leash-bio/preprocessed/binds1/fingerprints/fingerprints_b1_chunk_4.csv', '/home/pervinco/Datasets/leash-bio/preprocessed/binds1/fingerprints/fingerprints_b1_chunk_5.csv', '/home/pervinco/Datasets/leash-bio/preprocessed/binds1/fingerprints/fingerprints_b1_chunk_6.csv', '/home/pervinco/Datasets/leash-bio/preprocessed/binds1/fingerprints/fingerprints_b1_chunk_7.csv', '/home/pervinco/Datasets/leash-bio/preprocessed/binds1/fingerprints/fingerprints_b1_chunk_8.csv', '/home/pervinco/Datasets/leash-bio/preprocessed/binds1/fingerprints/fingerprints_b1_chunk_9.csv']
['/home/pervi

In [8]:
# Peek at the first binds=0 fingerprint chunk to inspect its column layout.
df = pd.read_csv(filepath_or_buffer=b0_fp_files[0])
df.head(n=5)

Unnamed: 0,0,binds,_PolarizabilityC1,_PolarizabilityC2,_PolarizabilityC3,_SolventAccessibilityC1,_SolventAccessibilityC2,_SolventAccessibilityC3,_SecondaryStrC1,_SecondaryStrC2,...,_HydrophobicityD2075,_HydrophobicityD2100,_HydrophobicityD3001,_HydrophobicityD3025,_HydrophobicityD3050,_HydrophobicityD3075,_HydrophobicityD3100,protein_name_BRD4,protein_name_HSA,protein_name_sEH
0,0100100000000000000100000001000000000000010000...,0,0.251,0.526,0.222,0.258,0.372,0.369,0.5,0.164,...,72.687,99.486,0.073,19.677,45.668,69.604,100.0,1.0,0.0,0.0
1,0100100000000000000100000001000000000000010000...,0,0.278,0.45,0.273,0.433,0.365,0.202,0.524,0.282,...,74.713,99.836,0.164,22.989,49.754,74.548,100.0,0.0,1.0,0.0
2,0100100000000000000100000001000000000000010000...,0,0.283,0.456,0.261,0.445,0.301,0.254,0.468,0.268,...,70.811,99.64,0.18,25.225,48.829,74.955,100.0,0.0,0.0,1.0
3,0100000000000001000000000001000000000000010000...,0,0.251,0.526,0.222,0.258,0.372,0.369,0.5,0.164,...,72.687,99.486,0.073,19.677,45.668,69.604,100.0,1.0,0.0,0.0
4,0100000000000001000000000001000000000000010000...,0,0.278,0.45,0.273,0.433,0.365,0.202,0.524,0.282,...,74.713,99.836,0.164,22.989,49.754,74.548,100.0,0.0,1.0,0.0


In [None]:
# Same peek for the first binds=1 fingerprint chunk.
df = pd.read_csv(filepath_or_buffer=b1_fp_files[0])
df.head(n=5)

# 1. Descriptor Models

In [None]:
# Descriptor models: one RandomForest per chunk pair (binds=0 chunk i with
# binds=1 chunk i, paired by position in the two sorted file lists).
# For every pair the cell merges and shuffles the rows, holds out 20% as a test
# set, fits a MinMaxScaler on the training rows only, cross-validates and trains
# the forest on the training rows, saves scaler + model, and prints a
# classification report for the held-out rows.
seed = 42  # one seed for shuffle, split and forest so a re-run reproduces the numbers


def _scale_descriptor_frame(frame, fitted_scaler, scaled_cols, passthrough_cols):
    """Return `frame` with `scaled_cols` transformed by `fitted_scaler`.

    The scaled columns come first, followed by `passthrough_cols` unchanged.
    The row index of `frame` is preserved so the result stays aligned with the
    matching label series.
    """
    scaled = pd.DataFrame(
        fitted_scaler.transform(frame[scaled_cols]),
        columns=scaled_cols,
        index=frame.index,
    )
    return pd.concat([scaled, frame[passthrough_cols]], axis=1)


for b0_desc_file, b1_desc_file in zip(b0_desc_files, b1_desc_files):
    print(b0_desc_file, b1_desc_file)
    b0_df = pd.read_csv(b0_desc_file)
    b1_df = pd.read_csv(b1_desc_file)
    print(b0_df.shape)
    print(b1_df.shape)

    combined_df = pd.concat([b0_df, b1_df], ignore_index=True)
    # A CSV saved together with its index is read back with an 'Unnamed: N'
    # column. That is a row counter, not a descriptor, so it must not be used as
    # a feature. No-op when the files carry no such column.
    # NOTE(review): the descriptor CSV schema is not visible here - confirm.
    combined_df = combined_df.loc[:, ~combined_df.columns.str.startswith('Unnamed:')]
    # Seeded shuffle: without it the fixed random_state of the split below would
    # still select different rows on every run.
    combined_df = combined_df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # One-hot protein columns and the label are left unscaled.
    protein_columns = [col for col in combined_df.columns if col.startswith('protein_name_')]
    scaler_columns = [
        col for col in combined_df.columns
        if col not in protein_columns and col != 'binds'
    ]

    binds_counts = combined_df['binds'].value_counts()
    print(f"binds=0 count: {binds_counts.get(0, 0)}")
    print(f"binds=1 count: {binds_counts.get(1, 0)}")

    # Split BEFORE fitting the scaler so the held-out rows never influence the
    # preprocessing; stratify keeps the class ratio equal in both parts.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        combined_df.drop(columns=['binds']),
        combined_df['binds'],
        test_size=0.2,
        random_state=seed,
        stratify=combined_df['binds'],
    )

    # Fresh scaler per chunk; fitted on the training rows only.
    scaler = MinMaxScaler().fit(X_train_raw[scaler_columns])
    X_train = _scale_descriptor_frame(X_train_raw, scaler, scaler_columns, protein_columns)
    X_test = _scale_descriptor_frame(X_test_raw, scaler, scaler_columns, protein_columns)

    # The artifact name keeps the full CSV basename (including '.csv'), exactly
    # as before, so existing loaders keep finding the files.
    scaler_filename = f"{save_dir}/utils/desc_scaler_{os.path.basename(b0_desc_file)}.joblib"
    joblib.dump(scaler, scaler_filename)

    # 5-fold CV on the training portion only; cross_val_score works on clones,
    # so `model` itself is still unfitted afterwards.
    model = RandomForestClassifier(random_state=seed)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f"Cross-validation scores for {scores}")
    print(f"Mean cross-validation score: {scores.mean()}\n")

    # Train and save the model.
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)

    model.fit(X_train, y_train)

    model_filename = f"{save_dir}/weights/desc_model_{os.path.basename(b0_desc_file)}.joblib"
    joblib.dump(model, model_filename)

    # Report precision/recall on the rows neither the scaler nor the model saw.
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(report, '\n')


# 2. Fingerprint Models

In [None]:
# Reload the first binds=0 fingerprint chunk and show its dimensions and first rows.
df = pd.read_csv(filepath_or_buffer=b0_fp_files[0])
print(df.shape)
df.head(n=5)

In [None]:
# Fingerprint models: one RandomForest per chunk pair (binds=0 chunk i with
# binds=1 chunk i, paired by position in the two sorted file lists).
# Column '0' holds the fingerprint as a string of digit characters; it is
# expanded into one numeric column per character (char_0, char_1, ...). The
# remaining non-protein, non-label columns are min-max scaled, then a forest is
# trained on all rows of the pair and saved together with the fitted scaler.
seed = 42  # shared by the shuffle and the forest so a re-run reproduces the model


def _fingerprints_to_bit_frame(fp_strings):
    """Expand fingerprint strings into a numeric frame with one column per character.

    Parameters
    ----------
    fp_strings : pandas.Series of str
        Fingerprints written as digit characters, e.g. ``"0100100..."``.

    Returns
    -------
    pandas.DataFrame
        float32 frame with columns ``char_0 .. char_{max_len-1}`` and a default
        RangeIndex. Strings shorter than the longest one are padded with NaN.

    Raises
    ------
    ValueError
        If a string contains a character other than a decimal digit.
    """
    max_len = int(fp_strings.str.len().max())
    bits = np.full((len(fp_strings), max_len), np.nan, dtype=np.float32)
    for row, fp in enumerate(fp_strings):
        # ASCII codes of the characters; converted to digit values below.
        codes = np.frombuffer(fp.encode('ascii'), dtype=np.uint8)
        bits[row, :codes.size] = codes
    bits -= ord('0')  # '0' -> 0.0, '1' -> 1.0; NaN padding stays NaN
    if not (np.isnan(bits) | ((bits >= 0) & (bits <= 9))).all():
        raise ValueError("fingerprint strings must contain only digit characters")
    return pd.DataFrame(bits, columns=[f'char_{i}' for i in range(max_len)])


for b0_fp_file, b1_fp_file in zip(b0_fp_files, b1_fp_files):
    print(b0_fp_file, b1_fp_file)
    # Force the fingerprint column to be parsed as text instead of relying on
    # type inference for an all-digit column.
    b0_df = pd.read_csv(b0_fp_file, dtype={'0': str})
    b1_df = pd.read_csv(b1_fp_file, dtype={'0': str})
    print(b0_df.shape, b1_df.shape)

    combined_df = pd.concat([b0_df, b1_df], ignore_index=True)
    # The CSVs were saved with their index, which is read back as 'Unnamed: 0'.
    # It is a row counter, not a molecular feature, so drop it before it gets
    # scaled and fed to the model.
    combined_df = combined_df.loc[:, ~combined_df.columns.str.startswith('Unnamed:')]
    combined_df = combined_df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Replace the fingerprint string by its per-character numeric columns.
    # Both frames carry a fresh RangeIndex, so the column-wise concat aligns 1:1.
    split_columns = _fingerprints_to_bit_frame(combined_df['0'])
    combined_df = pd.concat([split_columns, combined_df.drop(columns=['0'])], axis=1)

    # Label, one-hot protein columns and fingerprint bits are left unscaled.
    columns_to_exclude = [
        col for col in combined_df.columns
        if col.startswith(('binds', 'protein_', 'char_'))
    ]
    columns_to_scale = [col for col in combined_df.columns if col not in columns_to_exclude]

    # Fresh scaler per chunk so every saved artifact is an independent object.
    scaler = MinMaxScaler()
    combined_df[columns_to_scale] = scaler.fit_transform(combined_df[columns_to_scale])

    binds_counts = combined_df['binds'].value_counts()
    print(combined_df.shape)
    print(f"binds=0 count: {binds_counts.get(0, 0)}")
    print(f"binds=1 count: {binds_counts.get(1, 0)}")

    X = combined_df.drop(columns=['binds'])
    y = combined_df['binds']
    print(X.shape, y.shape)

    # Trained on every row of the chunk pair; no hold-out evaluation in this cell.
    model = RandomForestClassifier(n_estimators=100, random_state=seed)
    model.fit(X, y)

    model_filename = f"{save_dir}/weights/fp_model_{os.path.basename(b0_fp_file).split('.')[0]}.joblib"
    joblib.dump(model, model_filename)

    scaler_filename = f"{save_dir}/utils/fp_scaler_{os.path.basename(b0_fp_file).split('.')[0]}.joblib"
    joblib.dump(scaler, scaler_filename)