In [2]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

# The project helper module lives outside the notebook directory, so its folder
# must be on sys.path before the import below.
sys.path.append("../../src/common")
from pharmacy_common import PharmacyCommon

common = PharmacyCommon()

In [3]:
# One workbook holds the three pre-made splits, one sheet per split.
train_test_path = "../../data/train_test_data/XO_train_test_data.xlsx"
train_dataset, test_dataset, validation_dataset = (
    pd.read_excel(train_test_path, sheet_name=sheet)
    for sheet in ('train_dataset', 'test_dataset', 'validation_dataset')
)

In [4]:
# Row counts of the train / test / validation splits, in that order.
print(*(len(split) for split in (train_dataset, test_dataset, validation_dataset)))

337 73 73


In [5]:
# Preview the first five training rows.
train_dataset.head(n=5)

Unnamed: 0,CID,SMILES,IC50(nM),Type,substructure
0,145967694,CC1=CC2=C(C=C1)N=C(O2)/C(=N/O)/CC3=CC=CC=C3,17500.0,inactive,7
1,76329670,CC1(C=CC2=CC(=C(C=C2O1)O)C(=O)/C=C/C3=CC(=C(C=...,1800.0,active,16
2,5320686,C1=CC(=CC=C1/C=C/C(=O)OC[C@@H]2[C@H]([C@@H]([C...,100000.0,inactive,6
3,155903284,C1=CC(=CC=C1C2=NC=NN2)NC(=O)C3C(NC(=O)NC3=O)O,1400.0,active,1
4,137648214,CCCCC1=NN2C(=N1)C3=C(NC2=O)NN=C3,529.0,active,9


Fingerprint Encoding

In [6]:
# SMILES columns of the three splits, always processed in train / test / validation order.
smiles_by_split = (
    train_dataset['SMILES'],
    test_dataset['SMILES'],
    validation_dataset['SMILES'],
)

# ECFP4 - 1024bits
X_train_ecfp4_1024, X_test_ecfp4_1024, X_val_ecfp4_1024 = [
    common.gen_ecfp4_fpts(smiles, 1024) for smiles in smiles_by_split
]
# ECFP4 - 2048bits
X_train_ecfp4_2048, X_test_ecfp4_2048, X_val_ecfp4_2048 = [
    common.gen_ecfp4_fpts(smiles, 2048) for smiles in smiles_by_split
]
# ECFP6 - 1024bits
X_train_ecfp6_1024, X_test_ecfp6_1024, X_val_ecfp6_1024 = [
    common.gen_ecfp6_fpts(smiles, 1024) for smiles in smiles_by_split
]
# ECFP6 - 2048bits
X_train_ecfp6_2048, X_test_ecfp6_2048, X_val_ecfp6_2048 = [
    common.gen_ecfp6_fpts(smiles, 2048) for smiles in smiles_by_split
]
# MACCS
X_train_maccs, X_test_maccs, X_val_maccs = [
    common.gen_maccs_fpts(smiles) for smiles in smiles_by_split
]

Progress: 100%|██████████| 337/337 [00:00<00:00, 1738.16it/s]
Progress: 100%|██████████| 73/73 [00:00<00:00, 1673.02it/s]
Progress: 100%|██████████| 73/73 [00:00<00:00, 1724.72it/s]
Progress:   0%|          | 0/337 [00:00<?, ?it/s]

Progress: 100%|██████████| 337/337 [00:00<00:00, 1226.67it/s]
Progress: 100%|██████████| 73/73 [00:00<00:00, 1211.56it/s]
Progress: 100%|██████████| 73/73 [00:00<00:00, 1225.62it/s]
Progress: 100%|██████████| 337/337 [00:00<00:00, 1720.69it/s]
Progress: 100%|██████████| 73/73 [00:00<00:00, 1684.87it/s]
Progress: 100%|██████████| 73/73 [00:00<00:00, 1670.89it/s]
Progress: 100%|██████████| 337/337 [00:00<00:00, 1186.17it/s]
Progress: 100%|██████████| 73/73 [00:00<00:00, 1219.70it/s]
Progress: 100%|██████████| 73/73 [00:00<00:00, 1219.65it/s]
Progress: 100%|██████████| 337/337 [00:00<00:00, 1279.58it/s]
Progress: 100%|██████████| 73/73 [00:00<00:00, 1346.57it/s]
Progress: 100%|██████████| 73/73 [00:00<00:00, 1318.88it/s]


In [7]:
# Sanity check: length of the first test-set fingerprint for every encoding.
tuple(
    len(fingerprints[0])
    for fingerprints in (
        X_test_ecfp4_1024,
        X_test_ecfp4_2048,
        X_test_ecfp6_1024,
        X_test_ecfp6_2048,
        X_test_maccs,
    )
)

(1024, 2048, 1024, 2048, 167)

Label encoding

In [8]:
# y data: raw string labels from the 'Type' column of each split
y_train = np.array(train_dataset['Type'])
y_test = np.array(test_dataset['Type'])
y_val = np.array(validation_dataset['Type'])

# Original data
print("Original data:")
print(y_train[0:5])
print(y_test[0:5])
print(y_val[0:5])

# Label encoder (string class -> integer id; this is NOT one-hot encoding)
import sklearn.preprocessing as preprocessing
label_encoder = preprocessing.LabelEncoder()
# Fit the mapping on the training labels only, then reuse that same mapping for
# the other splits. Re-fitting per split would build an independent mapping for
# each one, which silently disagrees with the training mapping whenever a split
# does not contain exactly the same set of classes. With a shared mapping,
# transform() raises on a label that was never seen during fitting.
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)
y_val = label_encoder.transform(y_val)
print("Encoded data:")
print(y_train[0:5])
print(y_test[0:5])
print(y_val[0:5])

Original data:
['inactive' 'active' 'inactive' 'active' 'active']
['active' 'active' 'inactive' 'inactive' 'active']
['inactive' 'active' 'inactive' 'inactive' 'inactive']
Encoded data:
[1 0 1 0 0]
[0 0 1 1 0]
[1 0 1 1 1]


Define the hyperparameter search grid

In [None]:
from sklearn.model_selection import ParameterGrid
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# Exhaustive XGBoost search space. Integer ranges are built with the built-in
# range() so every value is a plain Python int: the chosen parameter dict is
# later stored and stringified in the results table, and NumPy integer scalars
# would show up there as e.g. "np.int64(2)" on NumPy >= 2.
xgb_param_grid = ParameterGrid({
    'max_depth': list(range(2, 10)),            # 2 .. 9
    'colsample_bytree': [0.5, 0.7, 0.9],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': list(range(10, 201, 10)),   # 10, 20, ..., 200
    'reg_lambda': [0, 0.001, 0.01, 0.1, 1],     # L2 regularization
    'min_child_weight': [3, 5, 7]
})

In [None]:
from sklearn.model_selection import KFold

# Shared 10-fold splitter used for cross-validation; shuffling is seeded so the
# folds are the same on every run.
cv = KFold(shuffle=True, random_state=42, n_splits=10)

def train_and_eval(param, fingerprint):
    """Train one XGBoost classifier and score it on every split.

    Parameters
    ----------
    param : mapping
        Hyperparameters for the model. Must provide the keys 'n_estimators',
        'colsample_bytree', 'learning_rate', 'max_depth', 'reg_lambda' and
        'min_child_weight'.
    fingerprint : str
        Which precomputed feature matrices to use. One of "ECFP4-1024",
        "ECFP4-2048", "ECFP6-1024", "ECFP6-2048" or "MACCS".

    Returns
    -------
    tuple
        (train_acc, val_acc, test_acc, cv_acc), where cv_acc is the mean
        accuracy of cross-validation (using the module-level `cv` splitter)
        over the concatenated train + validation data.

    Raises
    ------
    ValueError
        If `fingerprint` is not one of the supported names.
    """
    if fingerprint == "ECFP4-1024":
        X_train, X_val, X_test = X_train_ecfp4_1024, X_val_ecfp4_1024, X_test_ecfp4_1024
    elif fingerprint == "MACCS":
        X_train, X_val, X_test = X_train_maccs, X_val_maccs, X_test_maccs
    elif fingerprint == "ECFP4-2048":
        X_train, X_val, X_test = X_train_ecfp4_2048, X_val_ecfp4_2048, X_test_ecfp4_2048
    elif fingerprint == "ECFP6-1024":
        X_train, X_val, X_test = X_train_ecfp6_1024, X_val_ecfp6_1024, X_test_ecfp6_1024
    elif fingerprint == "ECFP6-2048":
        X_train, X_val, X_test = X_train_ecfp6_2048, X_val_ecfp6_2048, X_test_ecfp6_2048
    else:
        # Fail fast with a clear message. Merely printing here would let the
        # function continue and crash later on the unassigned X_train.
        raise ValueError(
            f"Fingerprint invalid: {fingerprint!r}. Expected one of "
            "'ECFP4-1024', 'ECFP4-2048', 'ECFP6-1024', 'ECFP6-2048', 'MACCS'."
        )

    model = XGBClassifier(objective='binary:logistic', tree_method="hist", n_estimators=param['n_estimators'],
                        colsample_bytree=param['colsample_bytree'], learning_rate=param['learning_rate'],
                        max_depth=param["max_depth"], reg_lambda=param['reg_lambda'], min_child_weight=param['min_child_weight'],
                        random_state=42)
    model.fit(X_train, y_train)

    # Accuracy of the model fitted on the training split only.
    train_acc = model.score(X_train, y_train)
    val_acc = model.score(X_val, y_val)
    test_acc = model.score(X_test, y_test)

    # Cross-validation runs over train + validation together.
    X_train_val = np.concatenate((X_train, X_val), axis=0)
    y_train_val = np.concatenate((y_train, y_val), axis=0)
    cv_scores = cross_val_score(model, X_train_val, y_train_val, cv=cv, scoring='accuracy')
    cv_acc = np.mean(cv_scores)
    return train_acc, val_acc, test_acc, cv_acc

In [11]:
def do_grid_search(model_name, overfit_level, max_val_test_diff, cv_thres, fingerprint):
    """Evaluate every combination in `xgb_param_grid` and keep the acceptable ones.

    A combination is kept when all of the following hold:
      * cross-validation accuracy >= `cv_thres`
      * |test_acc - cv_acc| <  `overfit_level`
      * |train_acc - val_acc| and |train_acc - test_acc| <= `overfit_level`
      * |test_acc - val_acc| <= `max_val_test_diff`

    NOTE(review): test-set accuracy takes part in this filter, so the reported
    test accuracy of the kept models is not an untouched hold-out estimate.

    Parameters
    ----------
    model_name : label stored in the 'Model' field of the result.
    overfit_level : maximum tolerated accuracy gap (see the list above).
    max_val_test_diff : maximum tolerated |test_acc - val_acc|.
    cv_thres : minimum cross-validation accuracy.
    fingerprint : fingerprint name forwarded to `train_and_eval`; also stored
        in the 'Fingerprint' field of the result.

    Returns
    -------
    dict
        'Model' and 'Fingerprint' hold the given scalars; every other key maps
        to a list with one entry per accepted parameter combination.
    """
    # Per-combination columns, in the order they appear in the result.
    list_columns = (
        '10_fold cross validation',
        'Train_acc',
        'Val_acc',
        'Test_acc',
        'Train_val_diff',
        'Train_test_diff',
        'Test_val_diff',
        'Param',
    )
    accepted = {'Model': model_name, 'Fingerprint': fingerprint}
    accepted.update((column, []) for column in list_columns)

    top_cv_acc = -np.inf  # best cross-validation accuracy among accepted combinations
    progress = tqdm(xgb_param_grid, desc="Training XGBoosting models")

    for candidate in progress:
        train_acc, val_acc, test_acc, cv_acc = train_and_eval(candidate, fingerprint)

        gap_train_val = np.abs(train_acc - val_acc)
        gap_train_test = np.abs(train_acc - test_acc)
        gap_test_val = np.abs(test_acc - val_acc)
        gap_test_cv = np.abs(test_acc - cv_acc)

        is_acceptable = (
            cv_acc >= cv_thres
            and gap_test_cv < overfit_level
            and gap_train_val <= overfit_level
            and gap_train_test <= overfit_level
            and gap_test_val <= max_val_test_diff
        )
        if not is_acceptable:
            continue

        if cv_acc > top_cv_acc:
            top_cv_acc = cv_acc
            # Show the running best in the progress-bar label.
            progress.set_description(f"[+] Best score: {cv_acc} Best parameter: {candidate}")
            print(end="\n")

        row = (
            cv_acc,
            train_acc,
            val_acc,
            test_acc,
            gap_train_val,
            gap_train_test,
            gap_test_val,
            candidate,
        )
        for column, value in zip(list_columns, row):
            accepted[column].append(value)

    return accepted

In [19]:
# Acceptance thresholds for the search:
#   overfit_level - largest tolerated train/val, train/test and test/CV accuracy gap
#   val_test_diff - largest tolerated validation/test accuracy gap
#   cv_thres      - smallest accepted cross-validation accuracy
overfit_level, val_test_diff, cv_thres = 0.05, 0.03, 0.8

model_name = 'XGB_MACCS'
fingerprint = 'MACCS'
results = do_grid_search(
    model_name=model_name,
    overfit_level=overfit_level,
    max_val_test_diff=val_test_diff,
    cv_thres=cv_thres,
    fingerprint=fingerprint,
)
 

[+] Best score: 0.8024390243902438 Best parameter: {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 3, 'n_estimators': 70, 'reg_lambda': 1}:   0%|          | 22/12960 [00:01<16:09, 13.34it/s]




[+] Best score: 0.8048780487804879 Best parameter: {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 3, 'n_estimators': 80, 'reg_lambda': 1}:   0%|          | 26/12960 [00:01<17:38, 12.22it/s]




[+] Best score: 0.8170731707317074 Best parameter: {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 7, 'n_estimators': 100, 'reg_lambda': 0.1}:   1%|          | 151/12960 [00:14<19:32, 10.93it/s]




[+] Best score: 0.8195121951219513 Best parameter: {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 7, 'n_estimators': 110, 'reg_lambda': 0.1}:   1%|          | 153/12960 [00:14<20:34, 10.37it/s]




[+] Best score: 0.8292682926829269 Best parameter: {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 7, 'n_estimators': 150, 'reg_lambda': 0.1}:   1%|▏         | 165/12960 [00:16<25:38,  8.32it/s]




[+] Best score: 0.8292682926829269 Best parameter: {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 7, 'n_estimators': 150, 'reg_lambda': 0.1}: 100%|██████████| 12960/12960 [1:15:12<00:00,  2.87it/s]  


In [20]:
# Tabulate the accepted parameter combinations returned by the grid search.
results_df = pd.DataFrame(data=results)
results_df

Unnamed: 0,Model,Fingerprint,10_fold cross validation,Train_acc,Val_acc,Test_acc,Train_val_diff,Train_test_diff,Test_val_diff,Param
0,XGB_MACCS,MACCS,0.802439,0.842730,0.808219,0.808219,0.034511,0.034511,0.000000,"{'colsample_bytree': 0.5, 'learning_rate': 0.1..."
1,XGB_MACCS,MACCS,0.804878,0.854599,0.808219,0.835616,0.046380,0.018983,0.027397,"{'colsample_bytree': 0.5, 'learning_rate': 0.1..."
2,XGB_MACCS,MACCS,0.802439,0.848665,0.808219,0.835616,0.040446,0.013048,0.027397,"{'colsample_bytree': 0.5, 'learning_rate': 0.1..."
3,XGB_MACCS,MACCS,0.802439,0.863501,0.821918,0.835616,0.041584,0.027885,0.013699,"{'colsample_bytree': 0.5, 'learning_rate': 0.1..."
4,XGB_MACCS,MACCS,0.802439,0.842730,0.808219,0.835616,0.034511,0.007114,0.027397,"{'colsample_bytree': 0.5, 'learning_rate': 0.1..."
...,...,...,...,...,...,...,...,...,...,...
226,XGB_MACCS,MACCS,0.807317,0.857567,0.808219,0.808219,0.049348,0.049348,0.000000,"{'colsample_bytree': 0.9, 'learning_rate': 0.0..."
227,XGB_MACCS,MACCS,0.800000,0.842730,0.794521,0.794521,0.048209,0.048209,0.000000,"{'colsample_bytree': 0.9, 'learning_rate': 0.0..."
228,XGB_MACCS,MACCS,0.800000,0.842730,0.794521,0.794521,0.048209,0.048209,0.000000,"{'colsample_bytree': 0.9, 'learning_rate': 0.0..."
229,XGB_MACCS,MACCS,0.800000,0.839763,0.794521,0.794521,0.045242,0.045242,0.000000,"{'colsample_bytree': 0.9, 'learning_rate': 0.0..."


Write the tuning results to an Excel file

In [21]:
# Persist the tuning results. Create the output folder first so that a missing
# directory cannot make the export fail after the long grid search has finished.
output_dir = Path("../../results/Model_tuning_results")
output_dir.mkdir(parents=True, exist_ok=True)
results_df.to_excel(output_dir / f"{model_name}_tuning_results.xlsx")