In [None]:
import os
import warnings
warnings.filterwarnings("ignore")

import duckdb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem import DataStructs, Descriptors, rdFingerprintGenerator

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report

# Data Load

In [None]:
# Root directory of the dataset files. It can be overridden with the
# LEASH_BIO_DATA_DIR environment variable so the notebook runs on other
# machines; the fallback is the location the notebook was written against.
data_dir = os.environ.get("LEASH_BIO_DATA_DIR", "/home/pervinco/Datasets/leash-bio")
save_dir = f"{data_dir}/split_sets"

# Raw competition files in both CSV and Parquet form.
train_csv = f"{data_dir}/train.csv"
test_csv = f"{data_dir}/test.csv"

train_parquet = f"{data_dir}/train.parquet"
test_parquet = f'{data_dir}/test.parquet'

# Display the directory contents as the cell output.
os.listdir(data_dir)

['test.parquet',
 'train.csv',
 'test.csv',
 'train.parquet',
 'sample_submission.csv']

In [None]:
# data = pd.read_csv(train_csv)  # The full training data has 295,246,830 rows (about 295.2 million).

In [None]:
# Rows drawn per class; the result is a balanced sample of 2 * n_per_class rows.
n_per_class = 100_000

# NOTE: `ORDER BY random()` is not seeded here, so each run draws a different sample.
con = duckdb.connect()
try:
    data = con.query(f"""(SELECT *
                          FROM parquet_scan('{train_parquet}')
                          WHERE binds = 0
                          ORDER BY random()
                          LIMIT {n_per_class})
                          UNION ALL
                          (SELECT *
                          FROM parquet_scan('{train_parquet}')
                          WHERE binds = 1
                          ORDER BY random()
                          LIMIT {n_per_class})""").df()
finally:
    # Release the connection even when the query fails.
    con.close()

In [None]:
# Display the (rows, columns) shape of the sampled frame.
data.shape

(200000, 7)

In [None]:
# Preview the first rows of the sampled frame.
data.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
0,107558024,O=C(N[C@@H](Cc1ccsc1)C(=O)O)OCC1c2ccccc2-c2ccc...,NCc1c(F)cccc1N1CCCC1,COc1ccc(N)cc1Cl.Cl,COc1ccc(Nc2nc(NCc3c(F)cccc3N3CCCC3)nc(N[C@@H](...,sEH,0
1,148303829,O=C(Nc1c(F)cc(Br)cc1C(=O)O)OCC1c2ccccc2-c2ccccc21,CN(C)c1nc(Cl)c(CN)s1.Cl.Cl,NCc1cccs1,CN(C)c1nc(Cl)c(CNc2nc(NCc3cccs3)nc(Nc3c(F)cc(B...,sEH,0
2,108820608,O=C(N[C@@H](Cc1csc2ccccc12)C(=O)O)OCC1c2ccccc2...,Nc1ccc2cn[nH]c2c1,NCC1CCC2CC2C1,O=C(N[Dy])[C@H](Cc1csc2ccccc12)Nc1nc(NCC2CCC3C...,BRD4,0
3,171309983,O=C(Nc1cc(Cl)ncc1C(=O)O)OCC1c2ccccc2-c2ccccc21,COc1ccc(N)c(Cl)c1,CCS(=O)CCN.Cl,CCS(=O)CCNc1nc(Nc2ccc(OC)cc2Cl)nc(Nc2cc(Cl)ncc...,sEH,0
4,167265202,O=C(Nc1cc(C(=O)O)ccc1Cl)OCC1c2ccccc2-c2ccccc21,Cl.NCC1CNC(=O)C1,Cl.NCCN1C(=O)c2ccccc2S1(=O)=O,O=C1CC(CNc2nc(NCCN3C(=O)c4ccccc4S3(=O)=O)nc(Nc...,HSA,0


# EDA

전체 학습 데이터셋은 매우 불균형하다. binds=0이 293,656,924개, binds=1이 1,589,906개(약 0.54%)이다. 아래에서는 위에서 클래스별로 100,000개씩 샘플링한 균형 데이터를 사용한다.

In [None]:
# Count the rows of each class in the sampled frame.
binds_column = data['binds']
bind_0_count = int((binds_column == 0).sum())
bind_1_count = int((binds_column == 1).sum())

print(bind_0_count, bind_1_count)

In [None]:
# Distinct SMILES strings seen at each of the three building-block positions.
bb1_set, bb2_set, bb3_set = (
    set(data[column])
    for column in ('buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles')
)

# Pairwise overlaps: building blocks that occur at more than one position.
bb1_bb2_intersection = bb1_set & bb2_set
bb1_bb3_intersection = bb1_set & bb3_set
bb2_bb3_intersection = bb2_set & bb3_set

# First report whether each pair overlaps at all ...
print(f"Building block 1 & 2 중복 : {'있음' if bb1_bb2_intersection else '없음'}")
print(f"Building block 1 & 3 중복 : {'있음' if bb1_bb3_intersection else '없음'}")
print(f"Building block 2 & 3 중복 : {'있음' if bb2_bb3_intersection else '없음'}")

# ... then list the shared values themselves.
print(f"Building block 1과 2 사이의 중복된 값: {bb1_bb2_intersection}")
print(f"Building block 1과 3 사이의 중복된 값: {bb1_bb3_intersection}")
print(f"Building block 2와 3 사이의 중복된 값: {bb2_bb3_intersection}")

molecule_smiles는 SMILES 표기법으로 작성된 화학식이자 building block들이 화학적으로 결합한 소분자.  
기계학습을 위해서는 컴퓨터가 이해할 수 있는 형태로 Encoding 해야한다.

In [None]:
# Parse every molecule SMILES string in the sample into an RDKit molecule object.
mols = list(map(Chem.MolFromSmiles, data['molecule_smiles']))
# Show the first parsed molecule.
print(mols[0])

In [None]:
def get_fp_with_ao(mol, radius=2, fpSize=2048):
    """Compute a Morgan fingerprint for ``mol`` together with the generator's extra outputs.

    Args:
        mol: RDKit molecule to fingerprint.
        radius: Radius passed to ``GetMorganGenerator``.
        fpSize: Fingerprint size passed to ``GetMorganGenerator``.

    Returns:
        A ``(fingerprint, additional_output)`` tuple. The second element is the
        ``AdditionalOutput`` object that was handed to ``GetFingerprint``.
    """
    extras = rdFingerprintGenerator.AdditionalOutput()
    # Request every optional output before the fingerprint is generated.
    extras.AllocateAtomCounts()   # per-atom counts
    extras.AllocateAtomToBits()   # which bits each atom maps to
    extras.AllocateBitInfoMap()   # per-bit provenance info (see RDKit docs for the exact layout)

    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=fpSize)
    return generator.GetFingerprint(mol, additionalOutput=extras), extras

In [None]:
# Fingerprint the first parsed molecule and show both returned objects.
first_mol = mols[0]
fp, ao = get_fp_with_ao(first_mol)
print(fp, ao, sep="\n")

In [None]:
# Size of the example fingerprint and how many of its bits are set.
total_bits = fp.GetNumBits()
on_bits = fp.GetNumOnBits()

# Indices of the bits that are set.
on_bits_list = list(fp.GetOnBits())

# on_bits / total_bits is the fraction of bits that are set (the density);
# sparsity is its complement. The original label reported the density as "Sparsity".
density = on_bits / total_bits
print(f'Total bits: {total_bits}, On Bits: {on_bits}, Density: {density:.4f}, Sparsity: {1 - density:.4f}')

하나의 분자식에서도 여러 가지 추가 특성들을 계산할 수 있다.

In [None]:
# Example SMILES string used to demonstrate RDKit descriptors.
smiles = 'CC(=O)OC1=CC=CC=C1C(=O)O'

# Convert the SMILES string into a molecule object.
mol = Chem.MolFromSmiles(smiles)

# Compute all descriptors first; they only read `mol`, so the order is irrelevant.
mol_weight = Descriptors.ExactMolWt(mol)            # molecular weight (ExactMolWt)
log_p = Descriptors.MolLogP(mol)                    # logP (lipophilicity)
h_bond_donor = Descriptors.NumHDonors(mol)          # hydrogen-bond donors
h_bond_acceptor = Descriptors.NumHAcceptors(mol)    # hydrogen-bond acceptors
rotatable_bonds = Descriptors.NumRotatableBonds(mol)  # rotatable bonds
tpsa = Descriptors.TPSA(mol)                        # topological polar surface area
aromatic_rings = Descriptors.NumAromaticRings(mol)  # aromatic rings
hetero_atoms = Descriptors.NumHeteroatoms(mol)      # heteroatoms

# Report the descriptors in the same order as they were computed.
print(f"Molecular Weight: {mol_weight:.2f}")
print(f"LogP: {log_p:.2f}")
print(f"Number of Hydrogen Bond Donors: {h_bond_donor}")
print(f"Number of Hydrogen Bond Acceptors: {h_bond_acceptor}")
print(f"Number of Rotatable Bonds: {rotatable_bonds}")
print(f"Topological Polar Surface Area (TPSA): {tpsa:.2f}")
print(f"Number of Aromatic Rings: {aromatic_rings}")
print(f"Number of Heteroatoms: {hetero_atoms}")

# Preprocessing

`GetMorganGenerator`를 정의할 때 radius 값을 바꿔 가면서 학습해 보는 것도 한 가지 방법이 될 수 있다.

In [None]:
# Convert each SMILES string into an RDKit molecule object, stored in a new column.
data['molecule'] = data['molecule_smiles'].map(Chem.MolFromSmiles)

In [None]:
# Preview the frame, which now includes the `molecule` column.
data.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds,molecule
0,107558024,O=C(N[C@@H](Cc1ccsc1)C(=O)O)OCC1c2ccccc2-c2ccc...,NCc1c(F)cccc1N1CCCC1,COc1ccc(N)cc1Cl.Cl,COc1ccc(Nc2nc(NCc3c(F)cccc3N3CCCC3)nc(N[C@@H](...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7f89c79d6430>
1,148303829,O=C(Nc1c(F)cc(Br)cc1C(=O)O)OCC1c2ccccc2-c2ccccc21,CN(C)c1nc(Cl)c(CN)s1.Cl.Cl,NCc1cccs1,CN(C)c1nc(Cl)c(CNc2nc(NCc3cccs3)nc(Nc3c(F)cc(B...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7f89c79d6350>
2,108820608,O=C(N[C@@H](Cc1csc2ccccc12)C(=O)O)OCC1c2ccccc2...,Nc1ccc2cn[nH]c2c1,NCC1CCC2CC2C1,O=C(N[Dy])[C@H](Cc1csc2ccccc12)Nc1nc(NCC2CCC3C...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7f8a93899040>
3,171309983,O=C(Nc1cc(Cl)ncc1C(=O)O)OCC1c2ccccc2-c2ccccc21,COc1ccc(N)c(Cl)c1,CCS(=O)CCN.Cl,CCS(=O)CCNc1nc(Nc2ccc(OC)cc2Cl)nc(Nc2cc(Cl)ncc...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7f8a938990b0>
4,167265202,O=C(Nc1cc(C(=O)O)ccc1Cl)OCC1c2ccccc2-c2ccccc21,Cl.NCC1CNC(=O)C1,Cl.NCCN1C(=O)c2ccccc2S1(=O)=O,O=C1CC(CNc2nc(NCCN3C(=O)c4ccccc4S3(=O)=O)nc(Nc...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x7f8a93899120>


In [None]:
# Shared Morgan fingerprint generator; `compute_fingerprint` reads this module-level name.
fpg = rdFingerprintGenerator.GetMorganGenerator(fpSize=2048, radius=2)

def compute_fingerprint(mol):
    """Return the Morgan fingerprint of ``mol`` from the shared generator ``fpg``.

    Args:
        mol: RDKit molecule, or ``None``.

    Returns:
        The fingerprint produced by ``fpg.GetFingerprint``, or ``None`` when
        ``mol`` is ``None``.
    """
    return None if mol is None else fpg.GetFingerprint(mol)

# One fingerprint (or None) per row, aligned with the `molecule` column.
molecule_column = data['molecule']
data['fingerprints'] = molecule_column.apply(compute_fingerprint)

In [None]:
# Preview the frame, which now includes the `fingerprints` column.
data.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds,molecule,fingerprints
0,107558024,O=C(N[C@@H](Cc1ccsc1)C(=O)O)OCC1c2ccccc2-c2ccc...,NCc1c(F)cccc1N1CCCC1,COc1ccc(N)cc1Cl.Cl,COc1ccc(Nc2nc(NCc3c(F)cccc3N3CCCC3)nc(N[C@@H](...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7f89c79d6430>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
1,148303829,O=C(Nc1c(F)cc(Br)cc1C(=O)O)OCC1c2ccccc2-c2ccccc21,CN(C)c1nc(Cl)c(CN)s1.Cl.Cl,NCc1cccs1,CN(C)c1nc(Cl)c(CNc2nc(NCc3cccs3)nc(Nc3c(F)cc(B...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7f89c79d6350>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
2,108820608,O=C(N[C@@H](Cc1csc2ccccc12)C(=O)O)OCC1c2ccccc2...,Nc1ccc2cn[nH]c2c1,NCC1CCC2CC2C1,O=C(N[Dy])[C@H](Cc1csc2ccccc12)Nc1nc(NCC2CCC3C...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7f8a93899040>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,171309983,O=C(Nc1cc(Cl)ncc1C(=O)O)OCC1c2ccccc2-c2ccccc21,COc1ccc(N)c(Cl)c1,CCS(=O)CCN.Cl,CCS(=O)CCNc1nc(Nc2ccc(OC)cc2Cl)nc(Nc2cc(Cl)ncc...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7f8a938990b0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,167265202,O=C(Nc1cc(C(=O)O)ccc1Cl)OCC1c2ccccc2-c2ccccc21,Cl.NCC1CNC(=O)C1,Cl.NCCN1C(=O)c2ccccc2S1(=O)=O,O=C1CC(CNc2nc(NCCN3C(=O)c4ccccc4S3(=O)=O)nc(Nc...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x7f8a93899120>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# One-hot encode the target protein name into dense uint8 indicator columns.
onehot_encoder = OneHotEncoder(sparse_output=False)
protein_encoded = onehot_encoder.fit_transform(data['protein_name'].values.reshape(-1, 1)).astype(np.uint8)
protein_columns = onehot_encoder.get_feature_names_out(['protein'])

# Reuse data's own index so the concat below aligns row-for-row even when the
# index is not a default RangeIndex.
protein_encoded_df = pd.DataFrame(protein_encoded, columns=protein_columns, index=data.index)

# Drop indicator columns left over from a previous run of this cell so that
# re-running it does not append duplicate protein_* columns.
data = pd.concat(
    [data.drop(columns=list(protein_columns), errors='ignore'), protein_encoded_df],
    axis=1,
)

In [None]:
# The `molecule` and `fingerprints` columns hold RDKit objects, which CSV can
# only store as their string form (e.g. "<rdkit.Chem.rdchem.Mol object at 0x...>").
# Leave them out; they are derived from `molecule_smiles`, which is kept.
data.drop(columns=['molecule', 'fingerprints'], errors='ignore').to_csv("./train.csv", index=False)

# Training

In [None]:
def convert_to_numpy_array(explicit_bit_vect, dtype=np.float64):
    """Copy an RDKit bit vector into a 1-D NumPy array.

    Args:
        explicit_bit_vect: Fingerprint bit vector to convert.
        dtype: Element type of the returned array. The default (float64)
            matches what ``np.zeros((1,))`` produced before this parameter existed.

    Returns:
        A NumPy array filled by ``DataStructs.ConvertToNumpyArray``.
    """
    # The placeholder array is filled in place by RDKit.
    arr = np.zeros((1,), dtype=dtype)
    DataStructs.ConvertToNumpyArray(explicit_bit_vect, arr)
    return arr

# Rows whose SMILES failed to parse have a None fingerprint (see compute_fingerprint);
# converting None would raise, so restrict features and labels to the valid rows.
has_fingerprint = data['fingerprints'].notna()

# The encoder fitted here is reused with `.transform` when scoring the test set.
protein_onehot = onehot_encoder.fit_transform(
    data.loc[has_fingerprint, 'protein_name'].values.reshape(-1, 1)
).astype(np.uint8)

# The features are all 0/1, so uint8 keeps the matrix roughly 8x smaller than float64.
fingerprints = [
    convert_to_numpy_array(fp, dtype=np.uint8)
    for fp in data.loc[has_fingerprint, 'fingerprints']
]

# Feature layout: fingerprint bits followed by the protein one-hot columns.
X = np.hstack([np.stack(fingerprints), protein_onehot])
y = data.loc[has_fingerprint, 'binds'].tolist()

In [None]:
# Hold out 30% of the sample for validation with a fixed seed.
# NOTE(review): the recorded report further down lists 40,000 validation rows,
# which is 20% of the 200,000-row sample, so that output appears to come from a
# run with a different test_size - re-run to refresh it.
split_options = {"test_size": 0.3, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Random forest baseline. n_jobs=-1 builds (and later predicts with) the trees
# on all available cores; random_state keeps the fitted forest reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

In [None]:
# Probability of the second class (column 1 of predict_proba) for each validation row.
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Hard class predictions and their accuracy on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Validation Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))
print(y_pred_proba)

Validation Accuracy: 0.8840
              precision    recall  f1-score   support

           0       0.86      0.92      0.89     19993
           1       0.91      0.85      0.88     20007

    accuracy                           0.88     40000
   macro avg       0.89      0.88      0.88     40000
weighted avg       0.89      0.88      0.88     40000

[0.76 0.74 0.59 ... 0.99 1.   0.27]


# Evaluation

In [None]:
def convert_to_numpy_array(explicit_bit_vect):
    """Copy an RDKit bit vector into a 1-D NumPy array and return it.

    This repeats the helper of the same name defined in the Training section,
    so the evaluation cells can run from this point.
    """
    # Placeholder array that RDKit fills in place.
    bits = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(explicit_bit_vect, bits)
    return bits

In [None]:
# Number of test rows read and scored per iteration.
chunk_size = 1000

# Column of predict_proba that holds P(binds == 1).
positive_col = list(model.classes_).index(1)

# Fingerprints come from `compute_fingerprint`, which uses the module-level
# generator from the Preprocessing section, so test features match training.
submission_chunks = []
for chunk in pd.read_csv(test_csv, chunksize=chunk_size):
    molecules = chunk['molecule_smiles'].apply(Chem.MolFromSmiles)
    chunk_fingerprints = molecules.apply(compute_fingerprint)

    # Rows whose SMILES could not be parsed have no fingerprint. They still need
    # a row in the submission, so they keep the default score of 0.0.
    parsed = chunk_fingerprints.notna().to_numpy()
    binds_proba = np.zeros(len(chunk), dtype=np.float64)

    if parsed.any():
        protein_onehot_test = onehot_encoder.transform(
            chunk.loc[parsed, 'protein_name'].values.reshape(-1, 1)
        )
        fingerprints_test = [convert_to_numpy_array(fp) for fp in chunk_fingerprints[parsed]]
        # Same layout as training: fingerprint bits followed by the protein one-hot.
        X_test_final = np.hstack([np.stack(fingerprints_test), protein_onehot_test])
        # Score with probabilities rather than hard 0/1 labels.
        binds_proba[parsed] = model.predict_proba(X_test_final)[:, positive_col]

    # NOTE(review): the competition's sample_submission.csv is expected to use the
    # columns `id,binds` with a binding probability - confirm against that file.
    submission_chunks.append(
        pd.DataFrame({'id': chunk['id'].to_numpy(), 'binds': binds_proba})
    )

submission = pd.concat(submission_chunks, axis=0, ignore_index=True)
submission.to_csv('./submission.csv', index=False)