# 1. Importing modules and functions

In [2]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
import chembl_structure_pipeline
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
import numpy as np
import pandas as pd
from rdkit.Chem import PandasTools
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.model_selection import permutation_test_score
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import balanced_accuracy_score
import joblib
import pickle
from IPython.display import HTML
import matplotlib.pyplot as plt

# 2.Data entry and curation work set

In [3]:
uploaded_file_ws="datasets/HDAC6_ws.sdf"
supplier_ws = Chem.ForwardSDMolSupplier(uploaded_file_ws,sanitize=False)
failed_mols_ws = []
all_mols_ws =[]
wrong_structure_ws=[]
wrong_smiles_ws=[]
y_tr = []
y_bad_index=[]

for i, m in enumerate(supplier_ws):
    structure = Chem.Mol(m)
    all_mols_ws.append(structure)
    y_tr.append(m.GetIntProp("Active"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ws.append(m)
        wrong_smiles_ws.append(Chem.MolToSmiles(m))
        wrong_structure_ws.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ws), 'molecules')
print('Failed data: ', len(failed_mols_ws), 'molecules')
number_ws =[]
for i in range(len(failed_mols_ws)):
        number_ws.append(str(i+1))
bad_molecules_ws = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ws, 'SMILES of wrong structure: ': wrong_smiles_ws, 'No.': number_ws}, index=None)
bad_molecules_ws = bad_molecules_ws.set_index('No.')
bad_molecules_ws

Original data:  105 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [4]:
y_tr[:] = [x for i,x in enumerate(y_tr) if i not in y_bad_index]

In [5]:
len(y_tr)

105

# 3.Standardization SDF file for work set

In [6]:
records_ws = []
for i in range(len(all_mols_ws)):
    record = Chem.MolToMolBlock(all_mols_ws[i])
    records_ws.append(record)
            
mols_ws = []
for i,record in enumerate(records_ws):
    standard_record = chembl_structure_pipeline.standardize_molblock(record)
    m = Chem.MolFromMolBlock(standard_record)
    mols_ws.append(m)
           
moldf_ws = []
for val in mols_ws:
    if val != None:
        moldf_ws.append(val)
print('Kept data: ', len(moldf_ws), 'molecules')

Kept data:  105 molecules


# 4.Data entry and curation test set

In [7]:
uploaded_file_ts="datasets/HDAC6_ts.sdf"
supplier_ts = Chem.ForwardSDMolSupplier(uploaded_file_ts,sanitize=False)
failed_mols_ts = []
all_mols_ts =[]
wrong_structure_ts=[]
wrong_smiles_ts=[]
y_ts = []
y_bad_index=[]
for i, m in enumerate(supplier_ts):
    structure = Chem.Mol(m)
    all_mols_ts.append(structure)
    y_ts.append(m.GetIntProp("Active"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ts.append(m)
        wrong_smiles_ts.append(Chem.MolToSmiles(m))
        wrong_structure_ts.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ts), 'molecules')
print('Failed data: ', len(failed_mols_ts), 'molecules')
number_ts =[]
for i in range(len(failed_mols_ts)):
        number_ts.append(str(i+1))
bad_molecules_ts = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ts, 'SMILES of wrong structure: ': wrong_smiles_ts, 'No.': number_ts}, index=None)
bad_molecules_ts = bad_molecules_ts.set_index('No.')
bad_molecules_ts

Original data:  26 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [8]:
y_ts[:] = [x for i,x in enumerate(y_ts) if i not in y_bad_index]

In [9]:
len(y_ts)

26

# 5.Standardization SDF file for test set

In [10]:
records_ts = []
for i in range(len(all_mols_ts)):
    record = Chem.MolToMolBlock(all_mols_ts[i])
    records_ts.append(record)
            
mols_ts = []
for i,record in enumerate(records_ts):
    standard_record = chembl_structure_pipeline.standardize_molblock(record)
    m = Chem.MolFromMolBlock(standard_record)
    mols_ts.append(m)
           
moldf_ts = []
for val in mols_ts:
    if val != None:
        moldf_ts.append(val)
print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  26 molecules


# 6.Descriptor calculation for work set

In [11]:
fp_tr = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2,nBits=1024,useFeatures=False,useChirality = False) for m in moldf_ws]

In [12]:
def rdkit_numpy_convert(fp_tr):
    output = []
    for f in fp_tr:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [13]:
from numpy import savetxt
x_tr = rdkit_numpy_convert(fp_tr)

In [14]:
savetxt('Models/Morgan_fingerprint/x_tr.csv', x_tr, delimiter=',')

In [15]:
x_tr.shape

(105, 1024)

# 7.Descriptor calculation for test set

In [16]:
fp_ts = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2,nBits=1024,useFeatures=False,useChirality = False) for m in moldf_ts]

In [17]:
def rdkit_numpy_convert(fp_ts):
    output = []
    for f in fp_ts:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [18]:
x_ts = rdkit_numpy_convert(fp_ts)

In [19]:
x_ts.shape

(26, 1024)

# 8.GBM model building and validation 

## 8.1. GBM model building 

In [20]:
seed = 42

In [21]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [982]:
param_grid = {
    "loss":["deviance", 'exponential'],   
    "criterion": ["friedman_mse",  "mae"],   
    "n_estimators":[100, 200, 300, 400, 500]
    }
gbm = GridSearchCV(GradientBoostingClassifier(subsample=0.5, max_features=0.5), 
                   param_grid, n_jobs=2, cv=cv, verbose=1)

In [983]:
gbm.fit(x_tr, y_tr)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=GradientBoostingClassifier(max_features=0.5,
                                                  subsample=0.5),
             n_jobs=2,
             param_grid={'criterion': ['friedman_mse', 'mae'],
                         'loss': ['deviance', 'exponential'],
                         'n_estimators': [100, 200, 300, 400, 500]},
             verbose=1)

In [984]:
gbm.best_params_

{'criterion': 'friedman_mse', 'loss': 'deviance', 'n_estimators': 400}

In [985]:
best_clf_GBM = gbm.best_estimator_

In [23]:
best_clf_GBM = pickle.load(open('Models/Morgan_fingerprint/HDAC6_GBM_11_07_2023.pkl', 'rb'))

## 8.2.  5-fold-cross-validation   GBM model

In [136]:
y_pred_CV_GBM = cross_val_predict(best_clf_GBM, x_tr, y_tr, cv=cv)

In [137]:
confusion_matrix_CV_GBM = metrics.confusion_matrix(y_tr, y_pred_CV_GBM, labels=[0,1])
Kappa = metrics.cohen_kappa_score(y_tr, y_pred_CV_GBM, weights='linear')
TN, FP, FN, TP = confusion_matrix_CV_GBM.ravel()
SE = TP/(TP+FN)
SP = TN/(TN+FP)
BA = (SE + SP)/2
MCC=matthews_corrcoef(y_tr, y_pred_CV_GBM)
print("balanced_accuracy = ", round((BA), 2))
print("SE = ", round((SE), 2))
print("SP = ", round((SP), 2))
print("Kappa = ", round((Kappa), 2))
print("MCC = ", round((MCC), 2))
print(TP)
print(TN)
print(FP)
print(FN)

balanced_accuracy =  0.79
SE =  0.71
SP =  0.87
Kappa =  0.58
MCC =  0.58
27
58
9
11


In [1044]:
pickle.dump(best_clf_GBM, open('Models/Morgan_fingerprint/HDAC6_GBM.pkl', 'wb'))

## 8.3.Y-randomization for  GBM model

In [28]:
permutations = 500
score, permutation_scores, pvalue = permutation_test_score(best_clf_GBM, x_tr, y_tr,
                                                           cv=cv, scoring='balanced_accuracy',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=24)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.7min


True score =  0.79 
Y-randomization =  0.5 
p-value =  0.002


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.1min finished


In [138]:
max_Y_randomization = round(np.amax(permutation_scores, axis=0), 2) 
max_Y_randomization

0.67

In [139]:
standard_deviation = round(np.std(permutation_scores, axis=0), 2)
standard_deviation

0.06

In [140]:
min_Y_randomization = round(np.min(permutation_scores, axis=0), 2) 
min_Y_randomization

0.35

In [141]:
a = np.greater_equal(permutation_scores, score)
print("Coverage = ", sum(a) / len(a))

Coverage =  0.0


## 8.4. Model GBM: predict for molecules of test set 

In [81]:
y_pred_gbm = best_clf_GBM.predict(x_ts)

In [82]:
y_pred_gbm

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0])

In [83]:
confusion_matrix_GBM = metrics.confusion_matrix(y_ts, y_pred_gbm, labels=[0,1])
Kappa = metrics.cohen_kappa_score(y_ts, y_pred_gbm, weights='linear')
TN, FP, FN, TP = confusion_matrix_GBM.ravel()
SE = TP/(TP+FN)
SP = TN/(TN+FP)
BA = (SE + SP)/2
MCC=matthews_corrcoef(y_ts, y_pred_gbm)
print("balanced_accuracy = ", round((BA), 2))
print("SE = ", round((SE), 2))
print("SP = ", round((SP), 2))
print("Kappa = ", round((Kappa), 2))
print("MCC = ", round((MCC), 2))
print(TP)
print(TN)
print(FP)
print(FN)

balanced_accuracy =  0.94
SE =  1.0
SP =  0.88
Kappa =  0.84
MCC =  0.85
9
15
2
0


# 9. Estimating applicability domain.

In [33]:
d_ECFP4 = {}
for mol in Chem.SDMolSupplier("datasets/HDAC6_ws.sdf"):
    mg = AllChem.GetMorganFingerprintAsBitVect(mol, 2, useFeatures=True)
    for m in Chem.SDMolSupplier('datasets/HDAC6_ts.sdf'):
        if m is not None:
            mg_ = AllChem.GetMorganFingerprintAsBitVect(m, 2, useFeatures=True)
            d_ECFP4.setdefault(Chem.MolToSmiles(m),[]).append(DataStructs.FingerprintSimilarity(mg, mg_))

In [34]:
df_ECFP4 = pd.DataFrame.from_dict(d_ECFP4)

In [35]:
df_ECFP4.max()

O=C(CCCCCCC(=O)Nc1cccc(-c2cn(-c3ccc(I)cc3)nn2)c1)NO              1.000000
O=C(CCCCCCn1cc(-c2ccc(Nc3c4ccccc4nc4ccccc34)cc2)nn1)NO           1.000000
O=C(CCCCCCC(=O)Nc1cccc(-c2cn(C3CCCCC3)nn2)c1)NO                  0.702128
Cc1[nH]c2ccccc2c1CCNCc1ccc(C=CC(=O)NO)cc1                        0.482759
O=C(CCCCCCC(=O)Nc1ccc(-c2ccccc2)cc1)NO                           0.880000
Nc1ccc(-c2cc(C(=O)NCCCCCCC(=O)NO)no2)cc1                         1.000000
O=C(CCCCCCC(=O)Nc1cccc(-c2cn(Cc3ccc(O)cc3)nn2)c1)NO              0.622642
COc1cc2ccn(CCCCOc3cccc(NC(=O)CCCCCCC(=O)NO)c3)c2c(OC)c1OC        0.788462
COc1cc2ccn(CCOc3cccc(NC(=O)CCCCCCC(=O)NO)c3)c2c(OC)c1OC          0.857143
COc1ccc(Cl)cc1C(=O)NCCc1ccc(C=CC(=O)NO)cc1                       0.952381
CN(c1ccc(OCC(=O)NO)cc1)c1ncnc2ccccc12                            0.833333
COc1ccc(N(C)c2nc(C)nc3ccccc23)cc1OCCCC(=O)NO                     0.977273
CN(c1ccc(OCCCCCC(=O)NO)cc1)c1ncnc2ccccc12                        1.000000
COc1ccc2nc3cc(Cl)ccc3c(Nc3ccc(-c4cn(CC

In [36]:
threshold = 0.4
da_ECFP4 = np.asarray(df_ECFP4)
da_ECFP4 = np.amax(df_ECFP4, axis=0) >= threshold
da_ECFP4

O=C(CCCCCCC(=O)Nc1cccc(-c2cn(-c3ccc(I)cc3)nn2)c1)NO              True
O=C(CCCCCCn1cc(-c2ccc(Nc3c4ccccc4nc4ccccc34)cc2)nn1)NO           True
O=C(CCCCCCC(=O)Nc1cccc(-c2cn(C3CCCCC3)nn2)c1)NO                  True
Cc1[nH]c2ccccc2c1CCNCc1ccc(C=CC(=O)NO)cc1                        True
O=C(CCCCCCC(=O)Nc1ccc(-c2ccccc2)cc1)NO                           True
Nc1ccc(-c2cc(C(=O)NCCCCCCC(=O)NO)no2)cc1                         True
O=C(CCCCCCC(=O)Nc1cccc(-c2cn(Cc3ccc(O)cc3)nn2)c1)NO              True
COc1cc2ccn(CCCCOc3cccc(NC(=O)CCCCCCC(=O)NO)c3)c2c(OC)c1OC        True
COc1cc2ccn(CCOc3cccc(NC(=O)CCCCCCC(=O)NO)c3)c2c(OC)c1OC          True
COc1ccc(Cl)cc1C(=O)NCCc1ccc(C=CC(=O)NO)cc1                       True
CN(c1ccc(OCC(=O)NO)cc1)c1ncnc2ccccc12                            True
COc1ccc(N(C)c2nc(C)nc3ccccc23)cc1OCCCC(=O)NO                     True
CN(c1ccc(OCCCCCC(=O)NO)cc1)c1ncnc2ccccc12                        True
COc1ccc2nc3cc(Cl)ccc3c(Nc3ccc(-c4cn(CCCCCC(=O)NO)nn4)cc3)c2c1    True
CCN(CC)CCNC(=O)c1ccc

In [37]:
print("Coverage = ", sum(da_ECFP4) / len(da_ECFP4))

Coverage =  1.0


In [38]:
print(np.where(da_ECFP4 == 0)[0])

[]


# 10. Inside AD-only for GBM model

In [39]:
out_Ad=list(np.where(da_ECFP4 == 0)[0])

In [40]:
out_Ad

[]

In [41]:
y_pred_gbm_ad=list(y_pred_gbm)

In [42]:
len(y_pred_gbm_ad)

26

In [43]:
y_pred_gbm_ad[:] = [x for i,x in enumerate(y_pred_gbm_ad) if i not in out_Ad]

In [44]:
len(y_pred_gbm_ad)

26

In [45]:
y_ts_ad=list(y_ts)

In [46]:
len(y_ts)

26

In [47]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [48]:
len(y_ts_ad)

26

In [49]:
confusion_matrix_ts = metrics.confusion_matrix(y_ts_ad, y_pred_gbm_ad, labels=[0,1])

In [50]:
Kappa = metrics.cohen_kappa_score(y_ts_ad, y_pred_gbm_ad, weights='linear')
TN, FP, FN, TP = confusion_matrix_ts.ravel()
SE = TP/(TP+FN)
SP = TN/(TN+FP)
BA = (SE + SP)/2
MCC=matthews_corrcoef(y_ts_ad, y_pred_gbm_ad)
print("balanced_accuracy = ", round((BA), 2))
print("SE = ", round((SE), 2))
print("SP = ", round((SP), 2))
print("Kappa = ", round((Kappa), 2))
print("MCC = ", round((MCC), 2))

balanced_accuracy =  0.94
SE =  1.0
SP =  0.88
Kappa =  0.84
MCC =  0.85


# 11. Virtual Screening

## 11.1 Data entry and curation molecules for VS

In [51]:
uploaded_file_vs="datasets/HDAC6_vs_from_patents.sdf"
supplier_vs = Chem.ForwardSDMolSupplier(uploaded_file_vs,sanitize=False)
failed_mols_vs = []
all_mols_vs =[]
wrong_structure_vs=[]
wrong_smiles_vs=[]
y_vs = []
y_bad_index=[]
for i, m in enumerate(supplier_vs):
    structure = Chem.Mol(m)
    all_mols_vs.append(structure)
    y_vs.append(m.GetIntProp("HDAC6_class"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_vs.append(m)
        wrong_smiles_vs.append(Chem.MolToSmiles(m))
        wrong_structure_vs.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_vs), 'molecules')
print('Failed data: ', len(failed_mols_vs), 'molecules')
number_vs =[]
for i in range(len(failed_mols_vs)):
        number_ts.append(str(i+1))
bad_molecules_vs = pd.DataFrame({'No. failed molecule in original set': wrong_structure_vs, 'SMILES of wrong structure: ': wrong_smiles_vs, 'No.': number_vs}, index=None)
bad_molecules_vs = bad_molecules_vs.set_index('No.')
bad_molecules_vs

Original data:  12 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


In [52]:
y_vs[:] = [x for i,x in enumerate(y_vs) if i not in y_bad_index]
len(y_vs)

12

In [53]:
records_vs = []
for i in range(len(all_mols_vs)):
    record = Chem.MolToMolBlock(all_mols_vs[i])
    records_vs.append(record)
            
mols_vs = []
for i,record in enumerate(records_vs):
    standard_record = chembl_structure_pipeline.standardize_molblock(record)
    m = Chem.MolFromMolBlock(standard_record)
    mols_vs.append(m)
           
moldf_vs = []
for val in mols_vs:
    if val != None:
        moldf_vs.append(val)
print('Kept data: ', len(moldf_vs), 'molecules')

Kept data:  12 molecules


## 11.2 Descriptor calculation for VS

In [54]:
fp_vs = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2,nBits=1024,useFeatures=False,useChirality = False) for m in moldf_vs]

In [55]:
def rdkit_numpy_convert(fp_vs):
    output = []
    for f in fp_vs:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [56]:
x_vs = rdkit_numpy_convert(fp_vs)

In [57]:
x_vs.shape

(12, 1024)

## 11.3 Prediction for vs molecules

In [58]:
y_pred_gbm = best_clf_GBM.predict(x_vs)

In [59]:
y_pred_gbm

array([0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1])

In [60]:
confusion_matrix_GBM = metrics.confusion_matrix(y_vs, y_pred_gbm, labels=[0,1])
Kappa = metrics.cohen_kappa_score(y_vs, y_pred_gbm, weights='linear')
TN, FP, FN, TP = confusion_matrix_GBM.ravel()
SE = TP/(TP+FN)
SP = TN/(TN+FP)
BA = (SE + SP)/2
MCC=matthews_corrcoef(y_vs, y_pred_gbm)
print("balanced_accuracy = ", round((BA), 2))
print("SE = ", round((SE), 2))
print("SP = ", round((SP), 2))
print("Kappa = ", round((Kappa), 2))
print("MCC = ", round((MCC), 2))

balanced_accuracy =  0.94
SE =  0.89
SP =  1.0
Kappa =  0.8
MCC =  0.82


## Estimating applicability domain

In [61]:
d_ECFP4 = {}
for mol in Chem.SDMolSupplier("datasets/HDAC6_ws.sdf"):
    mg = AllChem.GetMorganFingerprintAsBitVect(mol, 2, useFeatures=True)
    for m in Chem.SDMolSupplier('datasets/HDAC6_vs_from_patents.sdf'):
        if m is not None:
            mg_ = AllChem.GetMorganFingerprintAsBitVect(m, 2, useFeatures=True)
            d_ECFP4.setdefault(Chem.MolToSmiles(m),[]).append(DataStructs.FingerprintSimilarity(mg, mg_))

In [62]:
df_ECFP4 = pd.DataFrame.from_dict(d_ECFP4)

In [63]:
df_ECFP4.max()

COc1cc2ncn(OCCCCCC(=O)NO)c(=O)c2cc1OC                    0.490909
COc1cc2ncn(OCCCCCCC(=O)NO)c(=O)c2cc1OC                   0.490909
O=C(CCCCCOn1cnc2ccc(Br)cc2c1=O)NO                        0.438596
O=C(CCCCCCOn1cnc2ccc(Br)cc2c1=O)NO                       0.438596
COc1ccc(C2Nc3ccccc3C(=O)N2CCCCCC(=O)NO)cc1               0.500000
COc1cccc(C2Nc3ccccc3C(=O)N2CCCCCC(=O)NO)c1               0.488889
COc1cc(C2Nc3ccccc3C(=O)N2CCCCCC(=O)NO)cc(OC)c1OC         0.478261
O=C(CCCCCN1C(=O)c2ccccc2NC1c1ccc(Cl)cc1)NO               0.500000
COc1ccc(C2Nc3ccccc3C(=O)N2CCCCCC(=O)NO)cc1COc1ccccc1F    0.403226
COc1ccc(C2Nc3ccccc3C(=O)N2CCCCCCC(=O)NO)cc1              0.500000
O=C(CCCCCCN1C(=O)c2ccccc2NC1c1ccc(Cl)cc1)NO              0.500000
COc1ccc(C2Nc3ccc(F)cc3C(=O)N2CCCCCCC(=O)NO)cc1           0.451613
dtype: float64

In [64]:
threshold = 0.45
da_ECFP4 = np.asarray(df_ECFP4)
da_ECFP4 = np.amax(df_ECFP4, axis=0) >= threshold
da_ECFP4

COc1cc2ncn(OCCCCCC(=O)NO)c(=O)c2cc1OC                     True
COc1cc2ncn(OCCCCCCC(=O)NO)c(=O)c2cc1OC                    True
O=C(CCCCCOn1cnc2ccc(Br)cc2c1=O)NO                        False
O=C(CCCCCCOn1cnc2ccc(Br)cc2c1=O)NO                       False
COc1ccc(C2Nc3ccccc3C(=O)N2CCCCCC(=O)NO)cc1                True
COc1cccc(C2Nc3ccccc3C(=O)N2CCCCCC(=O)NO)c1                True
COc1cc(C2Nc3ccccc3C(=O)N2CCCCCC(=O)NO)cc(OC)c1OC          True
O=C(CCCCCN1C(=O)c2ccccc2NC1c1ccc(Cl)cc1)NO                True
COc1ccc(C2Nc3ccccc3C(=O)N2CCCCCC(=O)NO)cc1COc1ccccc1F    False
COc1ccc(C2Nc3ccccc3C(=O)N2CCCCCCC(=O)NO)cc1               True
O=C(CCCCCCN1C(=O)c2ccccc2NC1c1ccc(Cl)cc1)NO               True
COc1ccc(C2Nc3ccc(F)cc3C(=O)N2CCCCCCC(=O)NO)cc1            True
dtype: bool

In [65]:
print("Coverage = ", sum(da_ECFP4) / len(da_ECFP4))

Coverage =  0.75


In [66]:
print(np.where(da_ECFP4 == 0)[0])

[2 3 8]


## Inside AD-only for GBM model

In [67]:
out_Ad=list(np.where(da_ECFP4 == 0)[0])

In [68]:
out_Ad

[2, 3, 8]

In [69]:
y_pred_gbm_ad=list(y_pred_gbm)

In [70]:
len(y_pred_gbm_ad)

12

In [71]:
y_pred_gbm_ad[:] = [x for i,x in enumerate(y_pred_gbm_ad) if i not in out_Ad]

In [72]:
len(y_pred_gbm_ad)

9

In [73]:
y_vs_ad=list(y_vs)

In [74]:
len(y_vs)

12

In [75]:
y_vs_ad[:] = [x for i,x in enumerate(y_vs_ad) if i not in out_Ad]
len(y_vs_ad)

9

In [76]:
confusion_matrix_vs = metrics.confusion_matrix(y_vs_ad, y_pred_gbm_ad, labels=[0,1])

In [77]:
Kappa = metrics.cohen_kappa_score(y_vs_ad, y_pred_gbm_ad, weights='linear')
TN, FP, FN, TP = confusion_matrix_vs.ravel()
SE = TP/(TP+FN)
SP = TN/(TN+FP)
BA = (SE + SP)/2
MCC=matthews_corrcoef(y_vs_ad, y_pred_gbm_ad)
print("balanced_accuracy = ", round((BA), 2))
print("SE = ", round((SE), 2))
print("SP = ", round((SP), 2))
print("Kappa = ", round((Kappa), 2))
print("MCC = ", round((MCC), 2))

balanced_accuracy =  1.0
SE =  1.0
SP =  1.0
Kappa =  1.0
MCC =  1.0
