In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

import duckdb
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem import DataStructs, Descriptors, rdFingerprintGenerator

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report

# Data Load

In [2]:
# Root directory of the leash-bio dataset. It can be overridden through the
# LEASH_BIO_DATA_DIR environment variable so the notebook is not tied to a
# single machine; when the variable is unset the original location is used.
data_dir = os.environ.get("LEASH_BIO_DATA_DIR", "/home/pervinco/Datasets/leash-bio")
save_dir = f"{data_dir}/split_sets"

# Train/test files in CSV form.
train_csv = f"{data_dir}/train.csv"
test_csv = f"{data_dir}/test.csv"

# Train/test files in parquet form.
train_parquet = f"{data_dir}/train.parquet"
test_parquet = f'{data_dir}/test.parquet'

# List the entries of the data directory; as the cell's last expression the result is displayed.
os.listdir(data_dir)

['test.parquet',
 'train.csv',
 'test.csv',
 'train.parquet',
 'sample_submission.csv']

In [3]:
# `radius` argument passed to rdFingerprintGenerator.GetMorganGenerator.
RADIUS = 2
# `fpSize` argument passed to rdFingerprintGenerator.GetMorganGenerator.
FPSIZE = 2048

In [4]:
# data = pd.read_csv(train_csv) ## The full dataset has 295,246,830 rows (roughly 295 million), so it is not loaded in one go.

In [5]:
# Draw a class-balanced sample straight from the parquet file: up to 100,000
# rows with binds = 0 and up to 100,000 rows with binds = 1, each chosen by
# ordering on random() and taking the first rows.
# NOTE(review): no seed is set for random(), so the sample is presumably
# different on every run — confirm whether reproducibility is required.
con = duckdb.connect()
try:
    data = con.query(f"""(SELECT *
                      FROM parquet_scan('{train_parquet}')
                      WHERE binds = 0
                      ORDER BY random()
                      LIMIT 100000)
                      UNION ALL
                      (SELECT *
                      FROM parquet_scan('{train_parquet}')
                      WHERE binds = 1
                      ORDER BY random()
                      LIMIT 100000)""").df()
finally:
    # Release the DuckDB connection even when the query raises.
    con.close()

In [6]:
# (rows, columns) of the sampled frame.
data.shape

(200000, 7)

In [7]:
# Preview the first rows of the sampled frame.
data.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
0,171770201,O=C(Nc1cc(Cl)ncc1C(=O)O)OCC1c2ccccc2-c2ccccc21,NC1=NC(=O)CS1,N#Cc1ccsc1N,N#Cc1ccsc1Nc1nc(NC2=NC(=O)CS2)nc(Nc2cc(Cl)ncc2...,sEH,0
1,280953257,O=C(O)[C@@H]1CCCN1C(=O)OCC1c2ccccc2-c2ccccc21,Nc1ccc2c(c1)CNCC2,Cl.NC[C@@H]1CCCO1,O=C(N[Dy])[C@@H]1CCCN1c1nc(NC[C@@H]2CCCO2)nc(N...,sEH,0
2,235300379,O=C(O)C1CN(C(=O)OCC2c3ccccc3-c3ccccc32)C1,Nc1ncc([N+](=O)[O-])s1,COc1cccc(N)n1,COc1cccc(Nc2nc(Nc3ncc([N+](=O)[O-])s3)nc(N3CC(...,sEH,0
3,230106112,O=C(Nc1ncc(Br)nc1C(=O)O)OCC1c2ccccc2-c2ccccc21,COCc1ccccc1CN,NCc1nc2ccccc2s1,COCc1ccccc1CNc1nc(NCc2nc3ccccc3s2)nc(Nc2ncc(Br...,HSA,0
4,53724801,Cc1ccc(C(=O)O)c(NC(=O)OCC2c3ccccc3-c3ccccc32)c1C,Cc1cc(C#N)c(N)s1,Cn1nnc(N)n1,Cc1cc(C#N)c(Nc2nc(Nc3nnn(C)n3)nc(Nc3c(C(=O)N[D...,BRD4,0


# Preprocessing

`GetMorganGenerator`를 정의할 때 radius 설정을 바꿔 가면서 학습해 보는 것도 하나의 방법이 될 수 있다.

In [8]:
data['molecule'] = data['molecule_smiles'].apply(Chem.MolFromSmiles) ## Convert each SMILES string into an RDKit molecule object.

In [9]:
# Preview the frame again, now including the `molecule` column.
data.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds,molecule
0,171770201,O=C(Nc1cc(Cl)ncc1C(=O)O)OCC1c2ccccc2-c2ccccc21,NC1=NC(=O)CS1,N#Cc1ccsc1N,N#Cc1ccsc1Nc1nc(NC2=NC(=O)CS2)nc(Nc2cc(Cl)ncc2...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7f0062d1ccf0>
1,280953257,O=C(O)[C@@H]1CCCN1C(=O)OCC1c2ccccc2-c2ccccc21,Nc1ccc2c(c1)CNCC2,Cl.NC[C@@H]1CCCO1,O=C(N[Dy])[C@@H]1CCCN1c1nc(NC[C@@H]2CCCO2)nc(N...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7f0062d1ceb0>
2,235300379,O=C(O)C1CN(C(=O)OCC2c3ccccc3-c3ccccc32)C1,Nc1ncc([N+](=O)[O-])s1,COc1cccc(N)n1,COc1cccc(Nc2nc(Nc3ncc([N+](=O)[O-])s3)nc(N3CC(...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7f0062d1ca50>
3,230106112,O=C(Nc1ncc(Br)nc1C(=O)O)OCC1c2ccccc2-c2ccccc21,COCc1ccccc1CN,NCc1nc2ccccc2s1,COCc1ccccc1CNc1nc(NCc2nc3ccccc3s2)nc(Nc2ncc(Br...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x7f0062d1cac0>
4,53724801,Cc1ccc(C(=O)O)c(NC(=O)OCC2c3ccccc3-c3ccccc32)c1C,Cc1cc(C#N)c(N)s1,Cn1nnc(N)n1,Cc1cc(C#N)c(Nc2nc(Nc3nnn(C)n3)nc(Nc3c(C(=O)N[D...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7f0062d1cb30>


In [10]:
# Morgan fingerprint generator configured with the notebook-level RADIUS and FPSIZE constants.
fpg = rdFingerprintGenerator.GetMorganGenerator(radius=RADIUS, fpSize=FPSIZE)

def compute_fingerprint(mol):
    """Compute the fingerprint of a single molecule with the global generator ``fpg``.

    Args:
        mol: Molecule object to fingerprint, or ``None``.

    Returns:
        ``fpg.GetFingerprint(mol)`` for a molecule, or ``None`` when ``mol``
        is ``None`` (the missing value is passed through unchanged).
    """
    return None if mol is None else fpg.GetFingerprint(mol)

# Fingerprint every entry of the `molecule` column; entries that are None stay None.
data['fingerprints'] = data['molecule'].apply(compute_fingerprint)

In [11]:
# Preview the frame again, now including the `fingerprints` column.
data.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds,molecule,fingerprints
0,171770201,O=C(Nc1cc(Cl)ncc1C(=O)O)OCC1c2ccccc2-c2ccccc21,NC1=NC(=O)CS1,N#Cc1ccsc1N,N#Cc1ccsc1Nc1nc(NC2=NC(=O)CS2)nc(Nc2cc(Cl)ncc2...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7f0062d1ccf0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
1,280953257,O=C(O)[C@@H]1CCCN1C(=O)OCC1c2ccccc2-c2ccccc21,Nc1ccc2c(c1)CNCC2,Cl.NC[C@@H]1CCCO1,O=C(N[Dy])[C@@H]1CCCN1c1nc(NC[C@@H]2CCCO2)nc(N...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7f0062d1ceb0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,235300379,O=C(O)C1CN(C(=O)OCC2c3ccccc3-c3ccccc32)C1,Nc1ncc([N+](=O)[O-])s1,COc1cccc(N)n1,COc1cccc(Nc2nc(Nc3ncc([N+](=O)[O-])s3)nc(N3CC(...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7f0062d1ca50>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,230106112,O=C(Nc1ncc(Br)nc1C(=O)O)OCC1c2ccccc2-c2ccccc21,COCc1ccccc1CN,NCc1nc2ccccc2s1,COCc1ccccc1CNc1nc(NCc2nc3ccccc3s2)nc(Nc2ncc(Br...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x7f0062d1cac0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,53724801,Cc1ccc(C(=O)O)c(NC(=O)OCC2c3ccccc3-c3ccccc32)c1C,Cc1cc(C#N)c(N)s1,Cn1nnc(N)n1,Cc1cc(C#N)c(Nc2nc(Nc3nnn(C)n3)nc(Nc3c(C(=O)N[D...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7f0062d1cb30>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [12]:
# One-hot encode `protein_name` into uint8 indicator columns (prefixed with
# "protein") and attach them to the sample frame.
onehot_encoder = OneHotEncoder(sparse_output=False)
protein_encoded = onehot_encoder.fit_transform(data['protein_name'].values.reshape(-1, 1)).astype(np.uint8)
protein_columns = onehot_encoder.get_feature_names_out(['protein'])
# Reuse data's own index so the column-wise concat below pairs rows
# positionally even if the frame has been filtered or re-indexed beforehand.
protein_encoded_df = pd.DataFrame(protein_encoded, columns=protein_columns, index=data.index)
# Drop indicator columns left over from an earlier run of this cell so that
# re-running it does not append duplicate columns.
data = pd.concat(
    [data.drop(columns=list(protein_columns), errors='ignore'), protein_encoded_df],
    axis=1,
)

In [13]:
# Write the preprocessed sample to ./train.csv (relative to the working directory), without the index.
# NOTE(review): the `molecule` and `fingerprints` columns hold Python objects, so they are
# presumably written as their string representations — confirm this file is usable downstream.
data.to_csv("./train.csv", index=False)

# Training

In [14]:
def convert_to_numpy_array(explicit_bit_vect):
    """Copy a fingerprint bit vector into a NumPy array.

    Args:
        explicit_bit_vect: Bit vector accepted by
            ``DataStructs.ConvertToNumpyArray``.

    Returns:
        The NumPy array that ``DataStructs.ConvertToNumpyArray`` filled.
    """
    # One-element placeholder handed to RDKit as the destination buffer;
    # presumably resized to the bit-vector length by the call — confirm
    # against the RDKit documentation.
    target = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(explicit_bit_vect, target)
    return target

# Encode proteins with the encoder that was already fitted during
# preprocessing. Using transform() (not fit_transform()) leaves the fitted
# state untouched, so the same categories are reused later for the test set.
protein_onehot = onehot_encoder.transform(data['protein_name'].values.reshape(-1, 1))
fingerprints = [convert_to_numpy_array(fp) for fp in data['fingerprints'].tolist()]
# One feature vector per row: fingerprint bits followed by the protein one-hot.
X = [np.concatenate([fp, protein]) for fp, protein in zip(fingerprints, protein_onehot)]
y = data['binds'].tolist()

In [15]:
# Hold out 20% of the sample for validation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Alternative baseline, kept disabled:
# model = RandomForestClassifier(n_estimators=100, random_state=42)
# model.fit(X_train, y_train)

# LightGBM classifier settings, collected in one place for easy tuning.
lgbm_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
    reg_alpha=0.5,    # L1 regularization
    reg_lambda=0.5,   # L2 regularization
)
model = lgb.LGBMClassifier(**lgbm_params)
model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 79993, number of negative: 80007
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.078387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3706
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 1853
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499956 -> initscore=-0.000175
[LightGBM] [Info] Start training from score -0.000175


In [17]:
# Score the hold-out split: column 1 of predict_proba for inspection, and
# hard labels for the accuracy and the classification report.
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Validation Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))
print(y_pred_proba)

Validation Accuracy: 0.8998
              precision    recall  f1-score   support

           0       0.88      0.93      0.90     19993
           1       0.92      0.87      0.90     20007

    accuracy                           0.90     40000
   macro avg       0.90      0.90      0.90     40000
weighted avg       0.90      0.90      0.90     40000

[0.99219655 0.12855263 0.97963119 ... 0.98298298 0.93856937 0.05882482]


# Evaluation

In [None]:
def convert_to_numpy_array(explicit_bit_vect):
    """Copy a fingerprint bit vector into a NumPy array via ``DataStructs.ConvertToNumpyArray``.

    NOTE(review): a function with this same name and equivalent behavior is
    already defined in the Training section; running this cell re-binds the
    name. Consider keeping a single definition.
    """
    # One-element placeholder used as the destination; presumably resized by
    # ConvertToNumpyArray to the bit-vector length — confirm against RDKit docs.
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(explicit_bit_vect, arr)
    return arr

In [None]:
chunk_size = 1000
# Re-create the generator read by compute_fingerprint() (global `fpg`) with
# the same RADIUS / FPSIZE constants used for the training features.
fpg = rdFingerprintGenerator.GetMorganGenerator(radius=RADIUS, fpSize=FPSIZE)

submission_chunks = []
# Stream the test CSV `chunk_size` rows at a time instead of loading it whole.
for chunk in pd.read_csv(test_csv, chunksize=chunk_size):
    molecules = chunk['molecule_smiles'].apply(Chem.MolFromSmiles)
    # A molecule of None cannot be fingerprinted (compute_fingerprint returns
    # None for it), so such rows keep the default probability of 0.0 instead
    # of crashing the whole run.
    is_valid = molecules.notna().to_numpy()
    y_test_pred = np.zeros(len(chunk), dtype=float)

    if is_valid.any():
        fingerprints_test = [
            convert_to_numpy_array(compute_fingerprint(mol)) for mol in molecules[is_valid]
        ]
        protein_onehot_test = onehot_encoder.transform(
            chunk.loc[is_valid, 'protein_name'].values.reshape(-1, 1)
        )
        # Same feature layout as in training: fingerprint bits, then protein one-hot.
        X_test_final = np.hstack([np.vstack(fingerprints_test), protein_onehot_test])
        # Submit the positive-class probability rather than hard 0/1 labels.
        y_test_pred[is_valid] = model.predict_proba(X_test_final)[:, 1]

    # NOTE(review): assumes the expected submission schema is `id,binds`
    # (as in sample_submission.csv) — confirm against that file.
    submission_chunks.append(
        pd.DataFrame({'id': chunk['id'].to_numpy(), 'binds': y_test_pred})
    )

submission = pd.concat(submission_chunks, axis=0, ignore_index=True)
submission.to_csv('./submission.csv', index=False)