In [1]:
import warnings
warnings.filterwarnings(action='ignore')
import os
import argparse
import logging
import json
import rdkit
import rdkit.Chem
from rdkit.Chem import Descriptors
import pandas as pd
from rdkit import Chem
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
from rdkit.Chem.SaltRemover import SaltRemover

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import balanced_accuracy_score

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

from tqdm import tnrange, tqdm_notebook

Using TensorFlow backend.


In [2]:
remover = SaltRemover()
def remove_nonbonded(smiles):
    a=[]
    smiles_split = smiles.split(".")
    for i in range(0,len(smiles_split)):
        a.append(len(smiles_split[i]))
    return smiles_split[a.index(max(a))]

In [3]:

def smiles_to_finger_MACCS(smiles):
    mol_1 = Chem.MolFromSmiles(smiles)
    mol_1.MACCS = MACCSkeys.GenMACCSKeys(mol_1).ToBitString()
    prac = np.array(list(mol_1.MACCS))
    return prac


In [4]:
def smiles_to_finger(smiles,radian=2,nBits=1024):
    
    mol_1 = Chem.MolFromSmiles(smiles)
    mol_1.ecfp4 = AllChem.GetMorganFingerprintAsBitVect(mol_1,radian,nBits)
    prac = np.array(list(mol_1.ecfp4.ToBitString()))
    return prac


In [5]:
def gmean_score(y_true,y_pred):
    tn,fp,fn,tp = confusion_matrix(y_true,y_pred).ravel()
    return ((tp/(tp+fn))*(tn/(tn+fp)))**(1/2)

# Data Import

In [6]:
data = pd.read_csv("Data.csv",header=0)

In [7]:
data

Unnamed: 0,SMILES,n_np
0,Brc1cccc(c1N=C1NCCN1)Br,1
1,Brc1cc(Br)cc(c1N=C1NCCN1)Br,1
2,O=c1c(Br)ccc2n1C[C@@H]1CNC[C@@H]2C1,1
3,Cc1ccc(cc1)C,1
4,Cc1ccccc1C,1
...,...,...
2316,CO[C@H]1CCC[C@H]2C1=C(C=O)N1[C@@H]2C(C1=O)[C@@...,0
2317,CC/C(=C(/c1ccccc1)\c1ccc(cc1)OCCN(C)C)/c1ccccc1,0
2318,COC1(NC(=O)[C@H](c2cscc2)C(=O)O)C(=O)N2[C@H]1S...,0
2319,CN([C@@H]1C(=C(C(=O)N)C(=O)C2([C@H]1C[C@H]1C(=...,0


In [8]:
data.n_np = np.where(data.n_np==0,1,0)
data.n_np.value_counts()

0    1782
1     539
Name: n_np, dtype: int64

In [9]:
b= []
for i in range(0,len(data)):
    try:
        Chem.MolToSmiles(remover.StripMol(Chem.MolFromSmiles(remove_nonbonded(data['SMILES'][i]))))
    except:
        b.append(i)
        print(i)
        pass

In [10]:
b

[]

In [11]:
data=data.drop(data.index[b])
data = data.reset_index(drop=True)

In [12]:
data['CANONICAL_SMILES']= (data['SMILES'].apply(remove_nonbonded).apply(Chem.MolFromSmiles).apply(remover.StripMol).apply(Chem.MolToSmiles))

for i in range(data.shape[0]):
    if len(data.iloc[i,2]) == 0:
        data.iloc[i,2] = data.iloc[i,0]

# Make Fingerprint

In [13]:
fingerprint_2048 = data["CANONICAL_SMILES"].apply(smiles_to_finger,radian=2,nBits=2048)
fingerprint_2048 = pd.DataFrame(fingerprint_2048.to_list(),index=fingerprint_2048.index)

In [14]:
data_2048 = pd.concat([data,fingerprint_2048],axis=1)

In [15]:
acc = []
f_score = []
g_mean = []
auc = []
auprc = []
tpr = []
tnr = []
bal_acc = []
for i in tqdm_notebook(range(5)):
    X_train, X_test, y_train, y_test= train_test_split(data_2048.iloc[:,3:],data_2048["n_np"],
                                                       train_size=0.7, test_size=0.3,random_state=i,
                                                       stratify=data_2048["n_np"])
    rf = RandomForestClassifier(random_state=i)
    rf.fit(X_train,y_train)
    predicted = rf.predict(X_test)
    pred_proba = rf.predict_proba(X_test)[:,1]
    acc.append(accuracy_score(y_test,predicted))
    f_score.append(f1_score(y_test,predicted))
    g_mean.append(gmean_score(y_test,predicted))
    auc.append(roc_auc_score(y_test, pred_proba))
    auprc.append(average_precision_score(y_test,pred_proba))
    tn,fp,fn,tp = confusion_matrix(y_test,predicted).ravel()
    tpr.append(tp/(tp+fn))
    tnr.append(tn/(tn+fp))
    bal_acc.append(balanced_accuracy_score(y_test, predicted))
    print(confusion_matrix(y_test,predicted))





HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

[[526   9]
 [ 45 117]]
[[524  11]
 [ 57 105]]
[[525  10]
 [ 48 114]]
[[527   8]
 [ 53 109]]
[[524  11]
 [ 59 103]]



In [16]:
print("acc")
print(np.round(acc,4))
print(np.round(np.mean(acc),4),np.round(np.std(acc),4))
print("f_score")
print(np.round(f_score,4))
print(np.round(np.mean(f_score),4),np.round(np.std(f_score),4))
print("g_mean")
print(np.round(g_mean,4))
print(np.round(np.mean(g_mean),4),np.round(np.std(g_mean),4))
print("auc")
print(np.round(auc,4))
print(np.round(np.mean(auc),4),np.round(np.std(auc),4))
print("auprc")
print(np.round(auprc,4))
print(np.round(np.mean(auprc),4),np.round(np.std(auprc),4))
print("tpr")
print(np.round(tpr,4))
print(np.round(np.mean(tpr),4),np.round(np.std(tpr),4))
print("tnr")
print(np.round(tnr,4))
print(np.round(np.mean(tnr),4),np.round(np.std(tnr),4))
print("Balanced_accuracy")
print(np.round(bal_acc,4))
print(np.round(np.mean(bal_acc),4),np.round(np.std(bal_acc),4))

acc
[0.9225 0.9024 0.9168 0.9125 0.8996]
0.9108 0.0086
f_score
[0.8125 0.7554 0.7972 0.7814 0.7464]
0.7786 0.0248
g_mean
[0.8427 0.7968 0.831  0.8141 0.7891]
0.8147 0.0201
auc
[0.9533 0.9486 0.937  0.9506 0.9252]
0.9429 0.0105
auprc
[0.8972 0.8706 0.8878 0.8897 0.8493]
0.8789 0.0172
tpr
[0.7222 0.6481 0.7037 0.6728 0.6358]
0.6765 0.0326
tnr
[0.9832 0.9794 0.9813 0.985  0.9794]
0.9817 0.0022
Balanced_accuracy
[0.8527 0.8138 0.8425 0.8289 0.8076]
0.8291 0.0169


# SMOTE

In [17]:
acc = []
f_score = []
g_mean = []
auc = []
auprc = []
tpr = []
tnr = []
bal_acc = []
for i in tqdm_notebook(range(5)):
    X_train, X_test, y_train, y_test= train_test_split(data_2048.iloc[:,3:],data_2048["n_np"],
                                                       train_size=0.7, test_size=0.3,random_state=i,
                                                       stratify=data_2048["n_np"])
    
    sm = SMOTE(random_state=i)
    X_resampled, y_resampled = sm.fit_sample(X_train,y_train)
    rf = RandomForestClassifier(random_state=i)
    rf.fit(X_resampled,y_resampled)
    predicted = rf.predict(X_test)
    pred_proba = rf.predict_proba(X_test)[:,1]
    acc.append(accuracy_score(y_test,predicted))
    f_score.append(f1_score(y_test,predicted))
    g_mean.append(gmean_score(y_test,predicted))
    auc.append(roc_auc_score(y_test, pred_proba))
    auprc.append(average_precision_score(y_test,pred_proba))
    tn,fp,fn,tp = confusion_matrix(y_test,predicted).ravel()
    tpr.append(tp/(tp+fn))
    tnr.append(tn/(tn+fp))
    bal_acc.append(balanced_accuracy_score(y_test, predicted))
    print(confusion_matrix(y_test,predicted))





HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

[[526   9]
 [ 45 117]]
[[523  12]
 [ 49 113]]
[[524  11]
 [ 44 118]]
[[525  10]
 [ 45 117]]
[[523  12]
 [ 49 113]]



In [18]:
print("acc")
print(np.round(acc,4))
print(np.round(np.mean(acc),4),np.round(np.std(acc),4))
print("f_score")
print(np.round(f_score,4))
print(np.round(np.mean(f_score),4),np.round(np.std(f_score),4))
print("g_mean")
print(np.round(g_mean,4))
print(np.round(np.mean(g_mean),4),np.round(np.std(g_mean),4))
print("auc")
print(np.round(auc,4))
print(np.round(np.mean(auc),4),np.round(np.std(auc),4))
print("auprc")
print(np.round(auprc,4))
print(np.round(np.mean(auprc),4),np.round(np.std(auprc),4))
print("tpr")
print(np.round(tpr,4))
print(np.round(np.mean(tpr),4),np.round(np.std(tpr),4))
print("tnr")
print(np.round(tnr,4))
print(np.round(np.mean(tnr),4),np.round(np.std(tnr),4))
print("Balanced_accuracy")
print(np.round(bal_acc,4))
print(np.round(np.mean(bal_acc),4),np.round(np.std(bal_acc),4))

acc
[0.9225 0.9125 0.9211 0.9211 0.9125]
0.9179 0.0045
f_score
[0.8125 0.7875 0.811  0.8097 0.7875]
0.8016 0.0116
g_mean
[0.8427 0.8258 0.8446 0.8419 0.8258]
0.8361 0.0085
auc
[0.9517 0.9508 0.9386 0.9502 0.9306]
0.9444 0.0084
auprc
[0.8821 0.8647 0.8897 0.889  0.8704]
0.8792 0.01
tpr
[0.7222 0.6975 0.7284 0.7222 0.6975]
0.7136 0.0133
tnr
[0.9832 0.9776 0.9794 0.9813 0.9776]
0.9798 0.0022
Balanced_accuracy
[0.8527 0.8376 0.8539 0.8518 0.8376]
0.8467 0.0075


# SMOTE ENN

In [19]:
acc = []
f_score = []
g_mean = []
auc = []
auprc = []
tpr = []
tnr = []
bal_acc = []
for i in tqdm_notebook(range(5)):
    X_train, X_test, y_train, y_test= train_test_split(data_2048.iloc[:,3:],data_2048["n_np"],
                                                       train_size=0.7, test_size=0.3,random_state=i,
                                                       stratify=data_2048["n_np"])
    
    sm = SMOTEENN(random_state=i)
    X_resampled, y_resampled = sm.fit_sample(X_train,y_train)
    rf = RandomForestClassifier(random_state=i)
    rf.fit(X_resampled,y_resampled)
    predicted = rf.predict(X_test)
    pred_proba = rf.predict_proba(X_test)[:,1]
    acc.append(accuracy_score(y_test,predicted))
    f_score.append(f1_score(y_test,predicted))
    g_mean.append(gmean_score(y_test,predicted))
    auc.append(roc_auc_score(y_test, pred_proba))
    auprc.append(average_precision_score(y_test,pred_proba))
    tn,fp,fn,tp = confusion_matrix(y_test,predicted).ravel()
    tpr.append(tp/(tp+fn))
    tnr.append(tn/(tn+fp))
    bal_acc.append(balanced_accuracy_score(y_test, predicted))
    print(confusion_matrix(y_test,predicted))





HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

[[501  34]
 [ 38 124]]
[[489  46]
 [ 31 131]]
[[493  42]
 [ 37 125]]
[[505  30]
 [ 29 133]]
[[497  38]
 [ 35 127]]



In [20]:
print("acc")
print(np.round(acc,4))
print(np.round(np.mean(acc),4),np.round(np.std(acc),4))
print("f_score")
print(np.round(f_score,4))
print(np.round(np.mean(f_score),4),np.round(np.std(f_score),4))
print("g_mean")
print(np.round(g_mean,4))
print(np.round(np.mean(g_mean),4),np.round(np.std(g_mean),4))
print("auc")
print(np.round(auc,4))
print(np.round(np.mean(auc),4),np.round(np.std(auc),4))
print("auprc")
print(np.round(auprc,4))
print(np.round(np.mean(auprc),4),np.round(np.std(auprc),4))
print("tpr")
print(np.round(tpr,4))
print(np.round(np.mean(tpr),4),np.round(np.std(tpr),4))
print("tnr")
print(np.round(tnr,4))
print(np.round(np.mean(tnr),4),np.round(np.std(tnr),4))
print("Balanced_accuracy")
print(np.round(bal_acc,4))
print(np.round(np.mean(bal_acc),4),np.round(np.std(bal_acc),4))

acc
[0.8967 0.8895 0.8867 0.9154 0.8953]
0.8967 0.01
f_score
[0.775  0.7729 0.7599 0.8185 0.7768]
0.7806 0.0198
g_mean
[0.8466 0.8597 0.8432 0.8803 0.8534]
0.8567 0.0131
auc
[0.9445 0.944  0.9159 0.9478 0.9216]
0.9347 0.0132
auprc
[0.8732 0.8451 0.8531 0.8792 0.845 ]
0.8591 0.0144
tpr
[0.7654 0.8086 0.7716 0.821  0.784 ]
0.7901 0.0214
tnr
[0.9364 0.914  0.9215 0.9439 0.929 ]
0.929 0.0106
Balanced_accuracy
[0.8509 0.8613 0.8466 0.8825 0.8565]
0.8595 0.0125


# Make Descriptor

In [21]:
name = [
    'MolWt',
    'HeavyAtomMolWt',
    'ExactMolWt',
    'NumValenceElectrons',
    'NumRadicalElectrons',
    'MaxEStateIndex',
    'MinEStateIndex',
    'MaxAbsEStateIndex',
    'MinAbsEStateIndex',
    'BalabanJ',
    'BertzCT',
    'Chi0',
    'Chi0n',
    'Chi0v',
    'Chi1',
    'Chi1n',
    'Chi1v',
    'Chi2n',
    'Chi2v',
    'Chi3n',
    'Chi3v',
    'Chi4n',
    'Chi4v',
    'EState_VSA1',
    'EState_VSA10',
    'EState_VSA11',
    'EState_VSA2',
    'EState_VSA3',
    'EState_VSA4',
    'EState_VSA5',
    'EState_VSA6',
    'EState_VSA7',
    'EState_VSA8',
    'EState_VSA9',
    'FractionCSP3',
    'HallKierAlpha',
    'HeavyAtomCount',
    'Ipc',
    'Kappa1',
    'Kappa2',
    'Kappa3',
    'LabuteASA',
    'MolLogP',
    'MolMR',
    'NHOHCount',
    'NOCount',
    'NumAliphaticCarbocycles',
    'NumAliphaticHeterocycles',
    'NumAliphaticRings',
    'NumAromaticCarbocycles',
    'NumAromaticHeterocycles',
    'NumAromaticRings',
    'NumHAcceptors',
    'NumHDonors',
    'NumHeteroatoms',
    'NumRotatableBonds',
    'NumSaturatedCarbocycles',
    'NumSaturatedHeterocycles',
    'NumSaturatedRings',
    'PEOE_VSA1',
    'PEOE_VSA10',
    'PEOE_VSA11',
    'PEOE_VSA12',
    'PEOE_VSA13',
    'PEOE_VSA14',
    'PEOE_VSA2',
    'PEOE_VSA3',
    'PEOE_VSA4',
    'PEOE_VSA5',
    'PEOE_VSA6',
    'PEOE_VSA7',
    'PEOE_VSA8',
    'PEOE_VSA9',
    'RingCount',
    'SMR_VSA1',
    'SMR_VSA10',
    'SMR_VSA2',
    'SMR_VSA3',
    'SMR_VSA4',
    'SMR_VSA5',
    'SMR_VSA6',
    'SMR_VSA7',
    'SMR_VSA8',
    'SMR_VSA9',
    'SlogP_VSA1',
    'SlogP_VSA10',
    'SlogP_VSA11',
    'SlogP_VSA12',
    'SlogP_VSA2',
    'SlogP_VSA3',
    'SlogP_VSA4',
    'SlogP_VSA5',
    'SlogP_VSA6',
    'SlogP_VSA7',
    'SlogP_VSA8',
    'SlogP_VSA9',
    'TPSA',
    'VSA_EState1',
    'VSA_EState10',
    'VSA_EState2',
    'VSA_EState3',
    'VSA_EState4',
    'VSA_EState5',
    'VSA_EState6',
    'VSA_EState7',
    'VSA_EState8',
    'VSA_EState9',
    'fr_Al_COO',
    'fr_Al_OH',
    'fr_Al_OH_noTert',
    'fr_ArN',
    'fr_Ar_COO',
    'fr_Ar_N',
    'fr_Ar_NH',
    'fr_Ar_OH',
    'fr_COO',
    'fr_COO2',
    'fr_C_O',
    'fr_C_O_noCOO',
    'fr_C_S',
    'fr_HOCCN',
    'fr_Imine',
    'fr_NH0',
    'fr_NH1',
    'fr_NH2',
    'fr_N_O',
    'fr_Ndealkylation1',
    'fr_Ndealkylation2',
    'fr_Nhpyrrole',
    'fr_SH',
    'fr_aldehyde',
    'fr_alkyl_carbamate',
    'fr_alkyl_halide',
    'fr_allylic_oxid',
    'fr_amide',
    'fr_amidine',
    'fr_aniline',
    'fr_aryl_methyl',
    'fr_azide',
    'fr_azo',
    'fr_barbitur',
    'fr_benzene',
    'fr_benzodiazepine',
    'fr_bicyclic',
    'fr_diazo',
    'fr_dihydropyridine',
    'fr_epoxide',
    'fr_ester',
    'fr_ether',
    'fr_furan',
    'fr_guanido',
    'fr_halogen',
    'fr_hdrzine',
    'fr_hdrzone',
    'fr_imidazole',
    'fr_imide',
    'fr_isocyan',
    'fr_isothiocyan',
    'fr_ketone',
    'fr_ketone_Topliss',
    'fr_lactam',
    'fr_lactone',
    'fr_methoxy',
    'fr_morpholine',
    'fr_nitrile',
    'fr_nitro',
    'fr_nitro_arom',
    'fr_nitro_arom_nonortho',
    'fr_nitroso',
    'fr_oxazole',
    'fr_oxime',
    'fr_para_hydroxylation',
    'fr_phenol',
    'fr_phenol_noOrthoHbond',
    'fr_phos_acid',
    'fr_phos_ester',
    'fr_piperdine',
    'fr_piperzine',
    'fr_priamide',
    'fr_prisulfonamd',
    'fr_pyridine',
    'fr_quatN',
    'fr_sulfide',
    'fr_sulfonamd',
    'fr_sulfone',
    'fr_term_acetylene',
    'fr_tetrazole',
    'fr_thiazole',
    'fr_thiocyan',
    'fr_thiophene',
    'fr_unbrch_alkane',
    'fr_urea',
    'MaxAbsPartialCharge',
    'MaxPartialCharge',
    'MinAbsPartialCharge',
    'MinPartialCharge'
]

function = [
    Descriptors.MolWt,
    Descriptors.HeavyAtomMolWt,
    Descriptors.ExactMolWt,
    Descriptors.NumValenceElectrons,
    Descriptors.NumRadicalElectrons,
    # http://www.rdkit.org/docs/api/rdkit.Chem.Descriptors-module.html
    Descriptors.MaxEStateIndex,
    Descriptors.MinEStateIndex,
    Descriptors.MaxAbsEStateIndex,
    Descriptors.MinAbsEStateIndex,
    Descriptors.BalabanJ,
    Descriptors.BertzCT,
    Descriptors.Chi0,
    Descriptors.Chi0n,
    Descriptors.Chi0v,
    Descriptors.Chi1,
    Descriptors.Chi1n,
    Descriptors.Chi1v,
    Descriptors.Chi2n,
    Descriptors.Chi2v,
    Descriptors.Chi3n,
    Descriptors.Chi3v,
    Descriptors.Chi4n,
    Descriptors.Chi4v,
    Descriptors.EState_VSA1,
    Descriptors.EState_VSA10,
    Descriptors.EState_VSA11,
    Descriptors.EState_VSA2,
    Descriptors.EState_VSA3,
    Descriptors.EState_VSA4,
    Descriptors.EState_VSA5,
    Descriptors.EState_VSA6,
    Descriptors.EState_VSA7,
    Descriptors.EState_VSA8,
    Descriptors.EState_VSA9,
    Descriptors.FractionCSP3,
    Descriptors.HallKierAlpha,
    Descriptors.HeavyAtomCount,
    Descriptors.Ipc,
    Descriptors.Kappa1,
    Descriptors.Kappa2,
    Descriptors.Kappa3,
    Descriptors.LabuteASA,
    Descriptors.MolLogP,
    Descriptors.MolMR,
    Descriptors.NHOHCount,
    Descriptors.NOCount,
    Descriptors.NumAliphaticCarbocycles,
    Descriptors.NumAliphaticHeterocycles,
    Descriptors.NumAliphaticRings,
    Descriptors.NumAromaticCarbocycles,
    Descriptors.NumAromaticHeterocycles,
    Descriptors.NumAromaticRings,
    Descriptors.NumHAcceptors,
    Descriptors.NumHDonors,
    Descriptors.NumHeteroatoms,
    Descriptors.NumRotatableBonds,
    Descriptors.NumSaturatedCarbocycles,
    Descriptors.NumSaturatedHeterocycles,
    Descriptors.NumSaturatedRings,
    Descriptors.PEOE_VSA1,
    Descriptors.PEOE_VSA10,
    Descriptors.PEOE_VSA11,
    Descriptors.PEOE_VSA12,
    Descriptors.PEOE_VSA13,
    Descriptors.PEOE_VSA14,
    Descriptors.PEOE_VSA2,
    Descriptors.PEOE_VSA3,
    Descriptors.PEOE_VSA4,
    Descriptors.PEOE_VSA5,
    Descriptors.PEOE_VSA6,
    Descriptors.PEOE_VSA7,
    Descriptors.PEOE_VSA8,
    Descriptors.PEOE_VSA9,
    Descriptors.RingCount,
    Descriptors.SMR_VSA1,
    Descriptors.SMR_VSA10,
    Descriptors.SMR_VSA2,
    Descriptors.SMR_VSA3,
    Descriptors.SMR_VSA4,
    Descriptors.SMR_VSA5,
    Descriptors.SMR_VSA6,
    Descriptors.SMR_VSA7,
    Descriptors.SMR_VSA8,
    Descriptors.SMR_VSA9,
    Descriptors.SlogP_VSA1,
    Descriptors.SlogP_VSA10,
    Descriptors.SlogP_VSA11,
    Descriptors.SlogP_VSA12,
    Descriptors.SlogP_VSA2,
    Descriptors.SlogP_VSA3,
    Descriptors.SlogP_VSA4,
    Descriptors.SlogP_VSA5,
    Descriptors.SlogP_VSA6,
    Descriptors.SlogP_VSA7,
    Descriptors.SlogP_VSA8,
    Descriptors.SlogP_VSA9,
    Descriptors.TPSA,
    Descriptors.VSA_EState1,
    Descriptors.VSA_EState10,
    Descriptors.VSA_EState2,
    Descriptors.VSA_EState3,
    Descriptors.VSA_EState4,
    Descriptors.VSA_EState5,
    Descriptors.VSA_EState6,
    Descriptors.VSA_EState7,
    Descriptors.VSA_EState8,
    Descriptors.VSA_EState9,
    Descriptors.fr_Al_COO,
    Descriptors.fr_Al_OH,
    Descriptors.fr_Al_OH_noTert,
    Descriptors.fr_ArN,
    Descriptors.fr_Ar_COO,
    Descriptors.fr_Ar_N,
    Descriptors.fr_Ar_NH,
    Descriptors.fr_Ar_OH,
    Descriptors.fr_COO,
    Descriptors.fr_COO2,
    Descriptors.fr_C_O,
    Descriptors.fr_C_O_noCOO,
    Descriptors.fr_C_S,
    Descriptors.fr_HOCCN,
    Descriptors.fr_Imine,
    Descriptors.fr_NH0,
    Descriptors.fr_NH1,
    Descriptors.fr_NH2,
    Descriptors.fr_N_O,
    Descriptors.fr_Ndealkylation1,
    Descriptors.fr_Ndealkylation2,
    Descriptors.fr_Nhpyrrole,
    Descriptors.fr_SH,
    Descriptors.fr_aldehyde,
    Descriptors.fr_alkyl_carbamate,
    Descriptors.fr_alkyl_halide,
    Descriptors.fr_allylic_oxid,
    Descriptors.fr_amide,
    Descriptors.fr_amidine,
    Descriptors.fr_aniline,
    Descriptors.fr_aryl_methyl,
    Descriptors.fr_azide,
    Descriptors.fr_azo,
    Descriptors.fr_barbitur,
    Descriptors.fr_benzene,
    Descriptors.fr_benzodiazepine,
    Descriptors.fr_bicyclic,
    Descriptors.fr_diazo,
    Descriptors.fr_dihydropyridine,
    Descriptors.fr_epoxide,
    Descriptors.fr_ester,
    Descriptors.fr_ether,
    Descriptors.fr_furan,
    Descriptors.fr_guanido,
    Descriptors.fr_halogen,
    Descriptors.fr_hdrzine,
    Descriptors.fr_hdrzone,
    Descriptors.fr_imidazole,
    Descriptors.fr_imide,
    Descriptors.fr_isocyan,
    Descriptors.fr_isothiocyan,
    Descriptors.fr_ketone,
    Descriptors.fr_ketone_Topliss,
    Descriptors.fr_lactam,
    Descriptors.fr_lactone,
    Descriptors.fr_methoxy,
    Descriptors.fr_morpholine,
    Descriptors.fr_nitrile,
    Descriptors.fr_nitro,
    Descriptors.fr_nitro_arom,
    Descriptors.fr_nitro_arom_nonortho,
    Descriptors.fr_nitroso,
    Descriptors.fr_oxazole,
    Descriptors.fr_oxime,
    Descriptors.fr_para_hydroxylation,
    Descriptors.fr_phenol,
    Descriptors.fr_phenol_noOrthoHbond,
    Descriptors.fr_phos_acid,
    Descriptors.fr_phos_ester,
    Descriptors.fr_piperdine,
    Descriptors.fr_piperzine,
    Descriptors.fr_priamide,
    Descriptors.fr_prisulfonamd,
    Descriptors.fr_pyridine,
    Descriptors.fr_quatN,
    Descriptors.fr_sulfide,
    Descriptors.fr_sulfonamd,
    Descriptors.fr_sulfone,
    Descriptors.fr_term_acetylene,
    Descriptors.fr_tetrazole,
    Descriptors.fr_thiazole,
    Descriptors.fr_thiocyan,
    Descriptors.fr_thiophene,
    Descriptors.fr_unbrch_alkane,
    Descriptors.fr_urea,
    Descriptors.MaxAbsPartialCharge,
    Descriptors.MaxPartialCharge,
    Descriptors.MinAbsPartialCharge,
    Descriptors.MinPartialCharge
]

In [22]:
data_descriptor = data.copy()
data_descriptor['Mol'] = data_descriptor['CANONICAL_SMILES'].apply(Chem.MolFromSmiles)
for i in range(len(name)):
    #print(i)
    data_descriptor[name[i]] = data_descriptor['Mol'].apply(function[i]) #함수적용 
    
data_descriptor = data_descriptor.drop(['Mol'],axis=1)

In [23]:
data_descriptor

Unnamed: 0,SMILES,n_np,CANONICAL_SMILES,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,MaxEStateIndex,MinEStateIndex,...,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,MaxAbsPartialCharge,MaxPartialCharge,MinAbsPartialCharge,MinPartialCharge
0,Brc1cccc(c1N=C1NCCN1)Br,0,Brc1cccc(Br)c1N=C1NCCN1,319.000,309.928,316.916321,74,0,4.457824,0.831111,...,0,0,0,0,0,0,0.354321,0.196202,0.196202,-0.354321
1,Brc1cc(Br)cc(c1N=C1NCCN1)Br,0,Brc1cc(Br)c(N=C2NCCN2)c(Br)c1,397.896,389.832,394.826834,80,0,4.478102,0.818262,...,0,0,0,0,0,0,0.354321,0.196202,0.196202,-0.354321
2,O=c1c(Br)ccc2n1C[C@@H]1CNC[C@@H]2C1,0,O=c1c(Br)ccc2n1C[C@@H]1CNC[C@@H]2C1,269.142,256.038,268.021125,80,0,11.940193,0.128796,...,0,0,0,0,0,0,0.315723,0.264563,0.264563,-0.315723
3,Cc1ccc(cc1)C,0,Cc1ccc(C)cc1,106.168,96.088,106.078250,42,0,2.120370,1.329259,...,0,0,0,0,0,0,0.059063,-0.039774,0.039774,-0.059063
4,Cc1ccccc1C,0,Cc1ccccc1C,106.168,96.088,106.078250,42,0,2.120370,1.368056,...,0,0,0,0,0,0,0.061985,-0.039511,0.039511,-0.061985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2316,CO[C@H]1CCC[C@H]2C1=C(C=O)N1[C@@H]2C(C1=O)[C@@...,1,CO[C@H]1CCC[C@H]2C1=C(C=O)N1C(=O)C([C@H](C)O)[...,265.309,246.157,265.131408,104,0,12.124414,-0.663034,...,0,0,0,0,0,0,0.392506,0.234736,0.234736,-0.392506
2317,CC/C(=C(/c1ccccc1)\c1ccc(cc1)OCCN(C)C)/c1ccccc1,1,CC/C(=C(/c1ccccc1)c1ccc(OCCN(C)C)cc1)c1ccccc1,371.524,342.292,371.224915,144,0,5.870988,0.692414,...,0,0,0,0,0,0,0.492264,0.118903,0.118903,-0.492264
2318,COC1(NC(=O)[C@H](c2cscc2)C(=O)O)C(=O)N2[C@H]1S...,1,COC1(NC(=O)[C@@H](C(=O)O)c2ccsc2)C(=O)N2[C@@H]...,414.461,396.317,414.055543,146,0,12.777484,-1.798669,...,0,0,0,1,0,0,0.480398,0.327404,0.327404,-0.480398
2319,CN([C@@H]1C(=C(C(=O)N)C(=O)C2([C@H]1C[C@H]1C(=...,1,CN(C)[C@@H]1C(O)=C(C(N)=O)C(=O)C2(O)C(O)=C3C(=...,444.440,420.248,444.153266,170,0,13.313695,-2.745034,...,0,0,0,0,0,0,0.509651,0.255451,0.255451,-0.509651


# Run BaseModel

In [24]:
acc = []
f_score = []
g_mean = []
auc = []
auprc = []
tpr = []
tnr = []
bal_acc = []
for i in tqdm_notebook(range(5)):
    X_train, X_test, y_train, y_test= train_test_split(data_descriptor.iloc[:,3:],data_descriptor["n_np"],
                                                       train_size=0.7, test_size=0.3,random_state=i,
                                                       stratify=data_descriptor["n_np"])
    
    rf = RandomForestClassifier(n_estimators=100,random_state=i)
    rf.fit(X_train,y_train)
    predicted = rf.predict(X_test)
    pred_proba = rf.predict_proba(X_test)[:,1]
    acc.append(accuracy_score(y_test,predicted))
    f_score.append(f1_score(y_test,predicted))
    g_mean.append(gmean_score(y_test,predicted))
    auc.append(roc_auc_score(y_test, pred_proba))
    auprc.append(average_precision_score(y_test,pred_proba))
    tn,fp,fn,tp = confusion_matrix(y_test,predicted).ravel()
    tpr.append(tp/(tp+fn))
    tnr.append(tn/(tn+fp))
    bal_acc.append(balanced_accuracy_score(y_test, predicted))
    print(confusion_matrix(y_test,predicted))






HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

[[522  13]
 [ 40 122]]
[[526   9]
 [ 46 116]]
[[526   9]
 [ 51 111]]
[[522  13]
 [ 51 111]]
[[522  13]
 [ 55 107]]



In [25]:
print("acc")
print(np.round(acc,4))
print(np.round(np.mean(acc),4),np.round(np.std(acc),4))
print("f_score")
print(np.round(f_score,4))
print(np.round(np.mean(f_score),4),np.round(np.std(f_score),4))
print("g_mean")
print(np.round(g_mean,4))
print(np.round(np.mean(g_mean),4),np.round(np.std(g_mean),4))
print("auc")
print(np.round(auc,4))
print(np.round(np.mean(auc),4),np.round(np.std(auc),4))
print("auprc")
print(np.round(auprc,4))
print(np.round(np.mean(auprc),4),np.round(np.std(auprc),4))
print("tpr")
print(np.round(tpr,4))
print(np.round(np.mean(tpr),4),np.round(np.std(tpr),4))
print("tnr")
print(np.round(tnr,4))
print(np.round(np.mean(tnr),4),np.round(np.std(tnr),4))
print("Balanced_accuracy")
print(np.round(bal_acc,4))
print(np.round(np.mean(bal_acc),4),np.round(np.std(bal_acc),4))

acc
[0.924  0.9211 0.9139 0.9082 0.9024]
0.9139 0.008
f_score
[0.8215 0.8084 0.7872 0.7762 0.7589]
0.7904 0.0224
g_mean
[0.8572 0.839  0.8208 0.8176 0.8028]
0.8275 0.0188
auc
[0.9633 0.9621 0.945  0.9559 0.9334]
0.9519 0.0113
auprc
[0.909  0.8983 0.8999 0.8881 0.8797]
0.895 0.0101
tpr
[0.7531 0.716  0.6852 0.6852 0.6605]
0.7 0.0319
tnr
[0.9757 0.9832 0.9832 0.9757 0.9757]
0.9787 0.0037
Balanced_accuracy
[0.8644 0.8496 0.8342 0.8304 0.8181]
0.8393 0.0161


# SMOTE

In [26]:
acc = []
f_score = []
g_mean = []
auc = []
auprc = []
tpr = []
tnr = []
bal_acc = []
for i in tqdm_notebook(range(5)):
    X_train, X_test, y_train, y_test= train_test_split(data_descriptor.iloc[:,3:],data_descriptor["n_np"],
                                                       train_size=0.7, test_size=0.3,random_state=i,
                                                       stratify=data_descriptor["n_np"])
    
    sm = SMOTE(random_state=i)
    X_resampled, y_resampled = sm.fit_sample(X_train,y_train)
    rf = RandomForestClassifier(random_state=i)
    rf.fit(X_resampled,y_resampled)
    predicted = rf.predict(X_test)
    pred_proba = rf.predict_proba(X_test)[:,1]
    acc.append(accuracy_score(y_test,predicted))
    f_score.append(f1_score(y_test,predicted))
    g_mean.append(gmean_score(y_test,predicted))
    auc.append(roc_auc_score(y_test, pred_proba))
    auprc.append(average_precision_score(y_test,pred_proba))
    tn,fp,fn,tp = confusion_matrix(y_test,predicted).ravel()
    tpr.append(tp/(tp+fn))
    tnr.append(tn/(tn+fp))
    bal_acc.append(balanced_accuracy_score(y_test, predicted))
    print(confusion_matrix(y_test,predicted))





HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

[[516  19]
 [ 30 132]]
[[518  17]
 [ 38 124]]
[[521  14]
 [ 42 120]]
[[513  22]
 [ 37 125]]
[[513  22]
 [ 45 117]]



In [27]:
print("acc")
print(np.round(acc,4))
print(np.round(np.mean(acc),4),np.round(np.std(acc),4))
print("f_score")
print(np.round(f_score,4))
print(np.round(np.mean(f_score),4),np.round(np.std(f_score),4))
print("g_mean")
print(np.round(g_mean,4))
print(np.round(np.mean(g_mean),4),np.round(np.std(g_mean),4))
print("auc")
print(np.round(auc,4))
print(np.round(np.mean(auc),4),np.round(np.std(auc),4))
print("auprc")
print(np.round(auprc,4))
print(np.round(np.mean(auprc),4),np.round(np.std(auprc),4))
print("tpr")
print(np.round(tpr,4))
print(np.round(np.mean(tpr),4),np.round(np.std(tpr),4))
print("tnr")
print(np.round(tnr,4))
print(np.round(np.mean(tnr),4),np.round(np.std(tnr),4))
print("Balanced_accuracy")
print(np.round(bal_acc,4))
print(np.round(np.mean(bal_acc),4),np.round(np.std(bal_acc),4))

acc
[0.9297 0.9211 0.9197 0.9154 0.9039]
0.9179 0.0084
f_score
[0.8435 0.8185 0.8108 0.8091 0.7774]
0.8118 0.0212
g_mean
[0.8865 0.8609 0.8493 0.8602 0.8322]
0.8578 0.0177
auc
[0.961  0.9633 0.9466 0.9646 0.9372]
0.9545 0.0108
auprc
[0.9092 0.902  0.9025 0.9032 0.878 ]
0.899 0.0108
tpr
[0.8148 0.7654 0.7407 0.7716 0.7222]
0.763 0.0314
tnr
[0.9645 0.9682 0.9738 0.9589 0.9589]
0.9649 0.0057
Balanced_accuracy
[0.8897 0.8668 0.8573 0.8652 0.8406]
0.8639 0.0159


# SMOTE ENN

In [28]:
acc = []
f_score = []
g_mean = []
auc = []
auprc = []
tpr = []
tnr = []
bal_acc = []
for i in tqdm_notebook(range(5)):
    X_train, X_test, y_train, y_test= train_test_split(data_descriptor.iloc[:,3:],data_descriptor["n_np"],
                                                       train_size=0.7, test_size=0.3,random_state=i,
                                                       stratify=data_descriptor["n_np"])
    
    sm = SMOTEENN(random_state=i)
    X_resampled, y_resampled = sm.fit_sample(X_train,y_train)
    rf = RandomForestClassifier(random_state=i)
    rf.fit(X_resampled,y_resampled)
    pred_proba = rf.predict_proba(X_test)[:,1]
    acc.append(accuracy_score(y_test,predicted))
    f_score.append(f1_score(y_test,predicted))
    g_mean.append(gmean_score(y_test,predicted))
    auc.append(roc_auc_score(y_test, pred_proba))
    auprc.append(average_precision_score(y_test,pred_proba))
    tn,fp,fn,tp = confusion_matrix(y_test,predicted).ravel()
    tpr.append(tp/(tp+fn))
    tnr.append(tn/(tn+fp))
    bal_acc.append(balanced_accuracy_score(y_test, predicted))
    print(confusion_matrix(y_test,predicted))





HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

[[428 107]
 [130  32]]
[[428 107]
 [130  32]]
[[431 104]
 [127  35]]
[[426 109]
 [132  30]]
[[513  22]
 [ 45 117]]



In [29]:
print("acc")
print(np.round(acc,4))
print(np.round(np.mean(acc),4),np.round(np.std(acc),4))
print("f_score")
print(np.round(f_score,4))
print(np.round(np.mean(f_score),4),np.round(np.std(f_score),4))
print("g_mean")
print(np.round(g_mean,4))
print(np.round(np.mean(g_mean),4),np.round(np.std(g_mean),4))
print("auc")
print(np.round(auc,4))
print(np.round(np.mean(auc),4),np.round(np.std(auc),4))
print("auprc")
print(np.round(auprc,4))
print(np.round(np.mean(auprc),4),np.round(np.std(auprc),4))
print("tpr")
print(np.round(tpr,4))
print(np.round(np.mean(tpr),4),np.round(np.std(tpr),4))
print("tnr")
print(np.round(tnr,4))
print(np.round(np.mean(tnr),4),np.round(np.std(tnr),4))
print("Balanced_accuracy")
print(np.round(bal_acc,4))
print(np.round(np.mean(bal_acc),4),np.round(np.std(bal_acc),4))

acc
[0.66   0.66   0.6686 0.6542 0.9039]
0.7093 0.0974
f_score
[0.2126 0.2126 0.2326 0.1993 0.7774]
0.3269 0.2255
g_mean
[0.3975 0.3975 0.4172 0.384  0.8322]
0.4857 0.1736
auc
[0.9477 0.9463 0.9029 0.95   0.9083]
0.9311 0.0209
auprc
[0.8963 0.8651 0.828  0.887  0.8287]
0.861 0.0286
tpr
[0.1975 0.1975 0.216  0.1852 0.7222]
0.3037 0.2095
tnr
[0.8    0.8    0.8056 0.7963 0.9589]
0.8321 0.0634
Balanced_accuracy
[0.4988 0.4988 0.5108 0.4907 0.8406]
0.5679 0.1365
