In [13]:
import os
import warnings
warnings.filterwarnings("ignore")

import duckdb
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from tqdm import tqdm
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier, plot_importance as plot_importance_xgb
from lightgbm import LGBMClassifier, plot_importance as plot_importance_lgbm

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, auc, confusion_matrix, accuracy_score, classification_report

import lightgbm as lgb
from lightgbm import LGBMClassifier

In [2]:
data_dir = "/home/pervinco/Datasets/leash-bio"

train_csv = f"{data_dir}/train.csv"
test_csv = f"{data_dir}/test.csv"

train_parquet = f"{data_dir}/train.parquet"
test_parquet = f'{data_dir}/test.parquet'

os.listdir(data_dir)

['test.parquet',
 'train.csv',
 'test.csv',
 'train.parquet',
 'sample_submission.csv']

In [3]:
con = duckdb.connect()

data = con.query(f"""(SELECT * FROM parquet_scan('{train_parquet}') 
                      WHERE binds = 0
                      ORDER BY random()
                      LIMIT 30000)
                      UNION ALL
                      (SELECT * FROM parquet_scan('{train_parquet}')
                      WHERE binds = 1
                      ORDER BY random()
                      LIMIT 30000)""").df()

data.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
0,142019137,O=C(Nc1c(C(=O)O)cnn1CCO)OCC1c2ccccc2-c2ccccc21,Cc1ccc(N)c(Cl)n1,COC(=O)c1cc(N)ccc1Cl,COC(=O)c1cc(Nc2nc(Nc3ccc(C)nc3Cl)nc(Nc3c(C(=O)...,HSA,0
1,263243554,O=C(O)C[C@H](C/C=C/c1ccccc1)NC(=O)OCC1c2ccccc2...,Cn1cc(-c2ccccc2CN)cn1,NCc1cnc(N2CCOCC2)c(F)c1,Cn1cc(-c2ccccc2CNc2nc(NCc3cnc(N4CCOCC4)c(F)c3)...,HSA,0
2,167309366,O=C(Nc1cc(C(=O)O)ccc1Cl)OCC1c2ccccc2-c2ccccc21,Cl.NCCS(=O)(=O)C1CCOCC1,Nc1nc(=O)[nH]cc1Br,O=C(N[Dy])c1ccc(Cl)c(Nc2nc(NCCS(=O)(=O)C3CCOCC...,sEH,0
3,18909444,CC(OC(C)(C)C)C(NC(=O)OCC1c2ccccc2-c2ccccc21)C(...,Cc1ccc(Cl)c(N)c1,Nc1cc(Br)cn2ccnc12,Cc1ccc(Cl)c(Nc2nc(Nc3cc(Br)cn4ccnc34)nc(NC(C(=...,BRD4,0
4,208457398,O=C(Nc1cccc(Br)c1C(=O)O)OCC1c2ccccc2-c2ccccc21,Cc1cc(N)ccc1O,Nc1ccc2nccn2c1,Cc1cc(Nc2nc(Nc3ccc4nccn4c3)nc(Nc3cccc(Br)c3C(=...,HSA,0


In [4]:
data.shape

(60000, 7)

## Preprocessing

In [5]:
data['molecule'] = data['molecule_smiles'].apply(Chem.MolFromSmiles)

def modl(molecule_data, radius=2, bits=1024):
    if molecule_data is None:
        return None
    return list(AllChem.GetMorganFingerprintAsBitVect(molecule_data, radius, nBits=bits))

data['H1_ecfp'] = data['molecule'].apply(modl)
data.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds,molecule,H1_ecfp
0,142019137,O=C(Nc1c(C(=O)O)cnn1CCO)OCC1c2ccccc2-c2ccccc21,Cc1ccc(N)c(Cl)n1,COC(=O)c1cc(N)ccc1Cl,COC(=O)c1cc(Nc2nc(Nc3ccc(C)nc3Cl)nc(Nc3c(C(=O)...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x7efe19b36f20>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,263243554,O=C(O)C[C@H](C/C=C/c1ccccc1)NC(=O)OCC1c2ccccc2...,Cn1cc(-c2ccccc2CN)cn1,NCc1cnc(N2CCOCC2)c(F)c1,Cn1cc(-c2ccccc2CNc2nc(NCc3cnc(N4CCOCC4)c(F)c3)...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x7efe19b36eb0>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,167309366,O=C(Nc1cc(C(=O)O)ccc1Cl)OCC1c2ccccc2-c2ccccc21,Cl.NCCS(=O)(=O)C1CCOCC1,Nc1nc(=O)[nH]cc1Br,O=C(N[Dy])c1ccc(Cl)c(Nc2nc(NCCS(=O)(=O)C3CCOCC...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7efe19b36970>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,18909444,CC(OC(C)(C)C)C(NC(=O)OCC1c2ccccc2-c2ccccc21)C(...,Cc1ccc(Cl)c(N)c1,Nc1cc(Br)cn2ccnc12,Cc1ccc(Cl)c(Nc2nc(Nc3cc(Br)cn4ccnc34)nc(NC(C(=...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7efe19b36c80>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,208457398,O=C(Nc1cccc(Br)c1C(=O)O)OCC1c2ccccc2-c2ccccc21,Cc1cc(N)ccc1O,Nc1ccc2nccn2c1,Cc1cc(Nc2nc(Nc3ccc4nccn4c3)nc(Nc3cccc(Br)c3C(=...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x7efe19b369e0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [6]:
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoder_fit = onehot_encoder.fit_transform(data['protein_name'].values.reshape(-1, 1))

onehot_encoder

In [7]:
data.protein_name

0         HSA
1         HSA
2         sEH
3        BRD4
4         HSA
         ... 
59995     HSA
59996    BRD4
59997    BRD4
59998     sEH
59999     sEH
Name: protein_name, Length: 60000, dtype: object

In [8]:
X = [ecfp + protein for ecfp, protein in zip(data['H1_ecfp'].tolist(), onehot_encoder_fit.tolist())]
y = data['binds'].tolist()

In [9]:
for i in range(5):
    print(X[i])
    print(y[i])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive bayes": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "Ada Boost": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(max_depth=5, min_samples_split=2, random_state=105),
    "XGBoost": XGBClassifier(n_estimators=100, max_depth=250, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, 
                             objective='multi:softmax', num_class=3, random_state=42, tree_method='gpu_hist'),
    "LGBM": LGBMClassifier(boosting_type='gbdt', bagging_freq=5, verbose=0, device='gpu', num_leaves=31, max_depth=250, 
                           learning_rate=0.1, n_estimators=100)
}

In [12]:
for name, model in tqdm(models.items(), desc="Training models", total=len(models)):    
    model.fit(X_train, y_train)
    score_training = cross_val_score(model, X_train, y_train, cv=10)
    pred_model = model.predict(X_test)
    
    tqdm.write(f"Model: {model.__class__.__name__} has Accuracy {round(score_training.mean(), 2) * 100:.2f}%")
    print()

Training models:  12%|█▎        | 1/8 [01:53<13:12, 113.16s/it]

Model: LogisticRegression has Accuracy 87.00%



Training models:  25%|██▌       | 2/8 [02:17<06:04, 60.69s/it] 

Model: GaussianNB has Accuracy 74.00%



Training models:  38%|███▊      | 3/8 [02:45<03:50, 46.16s/it]

Model: KNeighborsClassifier has Accuracy 80.00%



Training models:  50%|█████     | 4/8 [06:06<07:08, 107.25s/it]

Model: AdaBoostClassifier has Accuracy 79.00%



Training models:  62%|██████▎   | 5/8 [20:19<18:47, 375.95s/it]

Model: GradientBoostingClassifier has Accuracy 84.00%



Training models:  75%|███████▌  | 6/8 [20:44<08:33, 256.56s/it]

Model: DecisionTreeClassifier has Accuracy 76.00%



Training models:  88%|████████▊ | 7/8 [23:47<03:52, 232.58s/it]

Model: XGBClassifier has Accuracy 92.00%







Training models: 100%|██████████| 8/8 [24:14<00:00, 181.87s/it]

Model: LGBMClassifier has Accuracy 90.00%






In [14]:
params = {'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'binary_logloss',
          'device': 'gpu',
          'gpu_platform_id': 0,
          'gpu_device_id': 0,
         }

model_LGBMClassifier = LGBMClassifier(**params)
model_LGBMClassifier.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 23909, number of negative: 24091
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2016
[LightGBM] [Info] Number of data points in the train set: 48000, number of used features: 1008
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (0.37 MB) transferred to GPU in 0.000488 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498104 -> initscore=-0.007583
[LightGBM] [Info] Start training from score -0.007583


In [15]:
prob_predictions = model_LGBMClassifier.predict_proba(X_test)
positive_probabilities = prob_predictions[:, 1]
df = pd.DataFrame(
    {'Id': range(1, len(X_test) + 1), 
     'binds': positive_probabilities})

df.to_csv('./submission_NGCDFG.csv', index=False)