# 1. Importing modules and functions

In [233]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
import chembl_structure_pipeline
import numpy as np
import pandas as pd
from rdkit.Chem import PandasTools
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import pairwise_distances
import joblib
import pickle
from IPython.display import HTML
import matplotlib.pyplot as plt

# 2.Data entry and curation work set

In [234]:
uploaded_file_ws="datasets/HDAC1 work set.sdf"
supplier_ws = Chem.ForwardSDMolSupplier(uploaded_file_ws,sanitize=False)
failed_mols_ws = []
all_mols_ws =[]
wrong_structure_ws=[]
wrong_smiles_ws=[]
for i, m in enumerate(supplier_ws):
    structure = Chem.Mol(m)
    all_mols_ws.append(structure)
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ws.append(m)
        wrong_smiles_ws.append(Chem.MolToSmiles(m))
        wrong_structure_ws.append(str(i+1))
print('Original data: ', len(all_mols_ws), 'molecules')
print('Failed data: ', len(failed_mols_ws), 'molecules')
number_ws =[]
for i in range(len(failed_mols_ws)):
        number_ws.append(str(i+1))
bad_molecules_ws = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ws, 'SMILES of wrong structure: ': wrong_smiles_ws, 'No.': number_ws}, index=None)
bad_molecules_ws = bad_molecules_ws.set_index('No.')
bad_molecules_ws

Original data:  169 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


# 3.Standardization SDF file for work set

In [235]:
records_ws = []
for i in range(len(all_mols_ws)):
    record = Chem.MolToMolBlock(all_mols_ws[i])
    records_ws.append(record)
            
mols_ws = []
for i,record in enumerate(records_ws):
    standard_record = chembl_structure_pipeline.standardize_molblock(record)
    m = Chem.MolFromMolBlock(standard_record)
    mols_ws.append(m)
           
moldf_ws = []
for val in mols_ws:
    if val != None:
        moldf_ws.append(val)
print('Kept data: ', len(moldf_ws), 'molecules')

Kept data:  169 molecules


# 4.Data entry and curation test set

In [236]:
uploaded_file_ts="datasets/HDAC1  test  set.sdf"
supplier_ts = Chem.ForwardSDMolSupplier(uploaded_file_ts,sanitize=False)
failed_mols_ts = []
all_mols_ts =[]
wrong_structure_ts=[]
wrong_smiles_ts=[]
for i, m in enumerate(supplier_ts):
    structure = Chem.Mol(m)
    all_mols_ts.append(structure)
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ts.append(m)
        wrong_smiles_ts.append(Chem.MolToSmiles(m))
        wrong_structure_ts.append(str(i+1))
print('Original data: ', len(all_mols_ts), 'molecules')
print('Failed data: ', len(failed_mols_ts), 'molecules')
number_ts =[]
for i in range(len(failed_mols_ts)):
        number_ts.append(str(i+1))
bad_molecules_ts = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ts, 'SMILES of wrong structure: ': wrong_smiles_ts, 'No.': number_ts}, index=None)
bad_molecules_ts = bad_molecules_ts.set_index('No.')
bad_molecules_ts

Original data:  42 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


# 5.Standardization SDF file for test set

In [237]:
records_ts = []
for i in range(len(all_mols_ts)):
    record = Chem.MolToMolBlock(all_mols_ts[i])
    records_ts.append(record)
            
mols_ts = []
for i,record in enumerate(records_ts):
    standard_record = chembl_structure_pipeline.standardize_molblock(record)
    m = Chem.MolFromMolBlock(standard_record)
    mols_ts.append(m)
           
moldf_ts = []
for val in mols_ts:
    if val != None:
        moldf_ts.append(val)
print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  42 molecules


# 6.Descriptor calculation for work set

In [238]:
fp_tr = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2,nBits=1024,useFeatures=False,useChirality = False) for m in moldf_ws]

In [239]:
def rdkit_numpy_convert(fp_tr):
    output = []
    for f in fp_tr:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [240]:
from numpy import savetxt
x_tr = rdkit_numpy_convert(fp_tr)

In [241]:
savetxt('Models/FP/x_tr.csv', x_tr, delimiter=',')

In [242]:
x_tr.shape

(169, 1024)

# 7.Descriptor calculation for test set

In [243]:
fp_ts = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2,nBits=1024,useFeatures=False,useChirality = False) for m in moldf_ts]

In [244]:
def rdkit_numpy_convert(fp_ts):
    output = []
    for f in fp_ts:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [245]:
x_ts = rdkit_numpy_convert(fp_ts)

In [246]:
x_ts.shape

(42, 1024)

# 8. RF model building and validation   

## 8.1.  RF model building

In [281]:
seed = 42

In [248]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [249]:
param_grid = {"max_features": [x_tr.shape[1] // 10, x_tr.shape[1] // 7, x_tr.shape[1] // 5, x_tr.shape[1] // 3],
              "n_estimators": [100, 250, 500, 1000]}

In [250]:
m = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=2, cv=cv, verbose=1)

In [251]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=RandomForestClassifier(), n_jobs=2,
             param_grid={'max_features': [102, 146, 204, 341],
                         'n_estimators': [100, 250, 500, 1000]},
             verbose=1)

In [252]:
m.best_params_

{'max_features': 102, 'n_estimators': 500}

In [253]:
best_clf_RF = m.best_estimator_

## 8.2. 5-fold-cross-validation   RF model

In [254]:
y_pred_CV_RF = cross_val_predict(best_clf_RF, x_tr, y_tr, cv=cv)

In [256]:
confusion_matrix_RF = metrics.confusion_matrix(y_tr, y_pred_CV_RF, labels=[0,1])
Kappa = metrics.cohen_kappa_score(y_tr, y_pred_CV_RF, weights='linear')
TN, FP, FN, TP = confusion_matrix_RF.ravel()
SE = TP/(TP+FN)
SP = TN/(TN+FP)
BA = (SE + SP)/2
print("balanced_accuracy = ", round((BA), 2))
print("SE = ", round((SE), 2))
print("SP = ", round((SP), 2))
print("Kappa = ", round((Kappa), 2))

balanced_accuracy =  0.87
SE =  0.83
SP =  0.91
Kappa =  0.74


In [257]:
pickle.dump(best_clf_RF, open('Models/FP/HDAC1_RF_ECFP4.pkl', 'wb'))

## 8.3.Y-randomization  RF model

In [262]:
permutations = 500
score, permutation_scores, pvalue = permutation_test_score(best_clf_RF, x_tr, y_tr,
                                                           cv=cv, scoring='balanced_accuracy',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=24)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   45.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  7.6min


True score =  0.87 
Y-randomization =  0.5 
p-value =  0.002


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  8.6min finished


In [169]:
max_Y_randomization = round(np.amax(permutation_scores, axis=0), 2) 
max_Y_randomization

0.65

In [170]:
standard_deviation = round(np.std(permutation_scores, axis=0), 3)
standard_deviation

0.045

In [171]:
min_Y_randomization = round(np.min(permutation_scores, axis=0), 2) 
min_Y_randomization

0.37

In [172]:
a = np.greater_equal(permutation_scores, score)
print("Coverage = ", sum(a) / len(a))

Coverage =  0.0


## 8.4. Model RF: predict for molecules of test set

In [258]:
y_pred_rf = best_clf_RF.predict(x_ts)

In [259]:
y_pred_rf

array([1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [260]:
confusion_matrix_ts = metrics.confusion_matrix(y_ts, y_pred_rf, labels=[0,1])

In [261]:
Kappa = metrics.cohen_kappa_score(y_ts, y_pred_rf, weights='linear')
TN, FP, FN, TP = confusion_matrix_ts.ravel()
SE = TP/(TP+FN)
SP = TN/(TN+FP)
BA = (SE + SP)/2
print("balanced_accuracy = ", round((BA), 2))
print("SE = ", round((SE), 2))
print("SP = ", round((SP), 2))
print("Kappa = ", round((Kappa), 2))

balanced_accuracy =  0.78
SE =  0.76
SP =  0.8
Kappa =  0.56


# 9. Estimating applicability domain. Method -  Euclidian distances,  K=1 (https://doi.org/10.1021/acs.jcim.0c00415)

In [263]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [264]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,159,160,161,162,163,164,165,166,167,168
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,4.898979,4.898979,3.000000,3.162278,2.828427,4.690416,2.828427,8.544004,0.000000,2.828427,...,3.605551,5.830952,4.472136,6.082763,5.830952,5.291503,3.316625,2.828427,5.291503,7.483315
2,7.483315,6.557439,3.316625,3.464102,3.162278,5.196152,3.162278,8.660254,4.242641,3.162278,...,3.872983,9.000000,4.472136,6.324555,8.774964,5.385165,4.123106,3.316625,5.477226,7.483315
3,7.483315,6.855655,3.316625,3.741657,3.162278,5.385165,3.162278,8.888194,5.196152,3.162278,...,4.358899,9.000000,4.472136,6.403124,8.774964,6.000000,5.196152,4.000000,5.567764,7.874008
4,7.681146,6.928203,3.316625,4.000000,3.162278,5.567764,3.316625,8.888194,5.196152,3.316625,...,5.000000,9.165151,5.291503,6.403124,8.774964,6.082763,6.000000,5.567764,5.656854,7.874008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,10.295630,9.643651,10.148892,10.392305,10.198039,9.327379,10.198039,10.816654,10.049876,10.295630,...,10.440307,11.045361,9.433981,9.591663,10.723805,9.055385,10.246951,10.148892,9.055385,10.148892
165,10.344080,9.643651,10.246951,10.392305,10.295630,9.380832,10.198039,10.862780,10.099505,10.295630,...,10.440307,11.045361,9.486833,9.643651,10.723805,9.110434,10.295630,10.148892,9.055385,10.148892
166,10.392305,9.848858,10.344080,10.392305,10.295630,9.486833,10.295630,10.954451,10.148892,10.392305,...,10.488088,11.135529,9.539392,9.695360,10.770330,9.219544,10.344080,10.246951,9.165151,10.295630
167,10.440307,9.949874,10.583005,10.630146,10.440307,9.643651,10.535654,10.954451,10.440307,10.535654,...,10.816654,11.180340,9.591663,9.797959,10.770330,9.486833,10.392305,10.392305,9.219544,10.295630


In [265]:
similarity= neighbors_k

In [266]:
Dmean=np.mean(similarity[1,:])

In [267]:
round(Dmean, 2)

3.32

In [268]:
std=np.std(similarity[1,:])

In [269]:
round(std, 2)

1.6

In [271]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

4.12


In [272]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [273]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,3.741657,0.000000,6.403124,3.872983,3.162278,4.795832,0.000000,3.316625,3.741657,3.000000,...,3.872983,5.000000,6.708204,3.162278,3.316625,1.000000,2.645751,2.828427,2.645751,5.385165
1,3.741657,3.162278,6.633250,4.242641,3.316625,5.099020,0.000000,3.741657,4.358899,3.000000,...,4.582576,7.141428,7.141428,3.316625,4.795832,4.898979,3.872983,4.123106,3.872983,5.916080
2,3.741657,3.316625,6.708204,4.242641,3.464102,5.099020,1.414214,3.741657,4.358899,3.162278,...,4.582576,7.483315,7.615773,4.690416,5.656854,4.898979,4.242641,4.795832,4.898979,6.244998
3,3.872983,3.464102,6.708204,4.242641,3.605551,5.099020,2.828427,4.000000,4.358899,3.605551,...,4.795832,7.483315,7.810250,5.385165,6.324555,5.291503,4.582576,5.567764,5.385165,6.324555
4,4.000000,3.464102,6.782330,4.242641,3.741657,5.099020,3.000000,4.582576,4.690416,3.741657,...,4.795832,7.483315,7.874008,5.656854,6.633250,6.164414,6.000000,6.480741,6.633250,6.480741
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,10.295630,9.110434,9.746794,10.488088,10.295630,10.583005,9.539392,9.643651,9.949874,9.380832,...,9.380832,10.295630,10.344080,9.486833,10.099505,10.392305,10.148892,10.344080,10.295630,9.327379
165,10.295630,9.165151,9.746794,10.488088,10.392305,10.677078,9.539392,9.695360,9.949874,9.380832,...,9.380832,10.344080,10.392305,9.643651,10.148892,10.488088,10.198039,10.344080,10.392305,9.486833
166,10.295630,9.273618,9.746794,10.488088,10.488088,10.770330,9.591663,9.695360,10.049876,9.486833,...,9.486833,10.344080,10.440307,9.695360,10.246951,10.535654,10.198039,10.392305,10.488088,9.539392
167,10.535654,9.327379,9.848858,10.723805,10.630146,11.000000,9.695360,9.695360,10.049876,9.486833,...,9.486833,10.344080,10.488088,9.797959,10.440307,10.535654,10.344080,10.392305,10.630146,9.643651


In [280]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[3.742 0.    6.403 3.873 3.162 4.796 0.    3.317 3.742 3.    0.    3.317
 0.    0.    4.583 0.    3.    3.317 0.    1.    0.    4.123 3.162 3.873
 4.796 4.899 3.162 3.317 1.732 1.    0.    1.    3.873 5.    6.708 3.162
 3.317 1.    2.646 2.828 2.646 5.385]


In [275]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True False  True  True False  True  True  True  True  True  True
  True  True False  True  True  True  True  True  True False  True  True
 False False  True  True  True  True  True  True  True False False  True
  True  True  True  True  True False]


In [276]:
print("Coverage = ", sum(cpd_AD) / len(cpd_AD))

Coverage =  0.7857142857142857


In [277]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [ 0  1  3  4  6  7  8  9 10 11 12 13 15 16 17 18 19 20 22 23 26 27 28 29
 30 31 32 35 36 37 38 39 40]


# 10.GBM model building and validation 

## 10.1. GBM model building 

In [282]:
param_grid = {"n_estimators": [100, 200, 300, 400, 500]}
gbm = GridSearchCV(GradientBoostingClassifier(subsample=0.5, max_features=0.5), 
                   param_grid, n_jobs=2, cv=cv, verbose=1)

In [283]:
gbm.fit(x_tr, y_tr)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=GradientBoostingClassifier(max_features=0.5,
                                                  subsample=0.5),
             n_jobs=2, param_grid={'n_estimators': [100, 200, 300, 400, 500]},
             verbose=1)

In [284]:
gbm.best_params_

{'n_estimators': 300}

In [285]:
best_clf_GBM = gbm.best_estimator_

## 10.2.  5-fold-cross-validation   GBM model

In [311]:
y_pred_CV_GBM = cross_val_predict(best_clf_GBM, x_tr, y_tr, cv=cv)

In [312]:
confusion_matrix_CV_GBM = metrics.confusion_matrix(y_tr, y_pred_CV_GBM, labels=[0,1])
Kappa = metrics.cohen_kappa_score(y_tr, y_pred_CV_GBM, weights='linear')
TN, FP, FN, TP = confusion_matrix_CV_GBM.ravel()
SE = TP/(TP+FN)
SP = TN/(TN+FP)
BA = (SE + SP)/2
print("balanced_accuracy = ", round((BA), 2))
print("SE = ", round((SE), 2))
print("SP = ", round((SP), 2))
print("Kappa = ", round((Kappa), 2))

balanced_accuracy =  0.86
SE =  0.81
SP =  0.91
Kappa =  0.73


In [313]:
pickle.dump(best_clf_RF, open('Models/FP/HDAC1_GBM_ECFP4.pkl', 'wb'))

## 10.3.Y-randomization for  GBM model

In [295]:
permutations = 500
score, permutation_scores, pvalue = permutation_test_score(best_clf_GBM, x_tr, y_tr,
                                                           cv=cv, scoring='balanced_accuracy',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=24)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.8min


True score =  0.85 
Y-randomization =  0.5 
p-value =  0.002


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.1min finished


In [296]:
max_Y_randomization = round(np.amax(permutation_scores, axis=0), 2) 
max_Y_randomization

0.63

In [297]:
standard_deviation = round(np.std(permutation_scores, axis=0), 3)
standard_deviation

0.045

In [298]:
min_Y_randomization = round(np.min(permutation_scores, axis=0), 2) 
min_Y_randomization

0.38

In [299]:
a = np.greater_equal(permutation_scores, score)
print("Coverage = ", sum(a) / len(a))

Coverage =  0.0


## 10.4. Model GBM: predict for molecules of test set 

In [317]:
y_pred_gbm = best_clf_GBM.predict(x_ts)

In [318]:
y_pred_gbm

array([1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [320]:
confusion_matrix_GBM = metrics.confusion_matrix(y_ts, y_pred_gbm, labels=[0,1])
Kappa = metrics.cohen_kappa_score(y_ts, y_pred_gbm, weights='linear')
TN, FP, FN, TP = confusion_matrix_GBM.ravel()
SE = TP/(TP+FN)
SP = TN/(TN+FP)
BA = (SE + SP)/2
print("balanced_accuracy = ", round((BA), 2))
print("SE = ", round((SE), 2))
print("SP = ", round((SP), 2))
print("Kappa = ", round((Kappa), 2))

balanced_accuracy =  0.78
SE =  0.76
SP =  0.8
Kappa =  0.56


# 11. SVM model building and validation  

## 11.1.  SVM model building 

In [321]:
scale = StandardScaler().fit(x_tr)
x_tr_sc = scale.transform(x_tr)

In [322]:
joblib.dump(scale, "Models/FP/HDAC1_ws_for SVM.pkl", compress=3)

['Models/FP/HDAC1_ws_for SVM.pkl']

In [323]:
param_grid = {"C": [10 ** i for i in range(0, 5)],
              "gamma": [10 ** i for i in range(-6, 0)]}

In [341]:
svm = GridSearchCV(SVC(kernel='rbf', probability=True), param_grid, n_jobs=2, cv=cv, verbose=1)

In [342]:
svm.fit(x_tr_sc, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=SVC(probability=True), n_jobs=2,
             param_grid={'C': [1, 10, 100, 1000, 10000],
                         'gamma': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]},
             verbose=1)

In [343]:
best_clf_SVM = svm.best_estimator_

## 11.2. 5-fold-cross-validation  SVM model

In [344]:
y_pred_CV_SVM = cross_val_predict(best_clf_SVM, x_tr_sc, y_tr, cv=cv)

In [345]:
confusion_matrix_CV_SVM = metrics.confusion_matrix(y_tr, y_pred_CV_SVM, labels=[0,1])
Kappa = metrics.cohen_kappa_score(y_tr, y_pred_CV_SVM, weights='linear')
TN, FP, FN, TP = confusion_matrix_CV_SVM.ravel()
SE = TP/(TP+FN)
SP = TN/(TN+FP)
BA = (SE + SP)/2
print("balanced_accuracy = ", round((BA), 2))
print("SE = ", round((SE), 2))
print("SP = ", round((SP), 2))
print("Kappa = ", round((Kappa), 2))

balanced_accuracy =  0.85
SE =  0.79
SP =  0.91
Kappa =  0.7


In [346]:
pickle.dump(best_clf_RF, open('Models/FP/HDAC1_SVM_ECFP4.pkl', 'wb'))

## 11.3.Y-randomization for  SVM model

In [347]:
permutations = 500
score, permutation_scores, pvalue = permutation_test_score(best_clf_SVM, x_tr_sc, y_tr,
                                                           cv=cv, scoring='balanced_accuracy',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=24)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   20.6s


True score =  0.85 
Y-randomization =  0.5 
p-value =  0.002


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   23.2s finished


In [348]:
max_Y_randomization = round(np.amax(permutation_scores, axis=0), 2) 
max_Y_randomization

0.61

In [349]:
standard_deviation = round(np.std(permutation_scores, axis=0), 3)
standard_deviation

0.031

In [350]:
min_Y_randomization = round(np.min(permutation_scores, axis=0), 2) 
min_Y_randomization

0.42

In [352]:
a = np.greater_equal(permutation_scores, score)
print("Coverage = ", sum(a) / len(a))

Coverage =  0.0


## 11.4. Model SVM: predict for molecules of test set  

In [357]:
scale = joblib.load("Models/FP/HDAC1_ws_for SVM.pkl")
x_ts_sc = scale.transform(x_ts)

In [358]:
y_pred_SVM = best_clf_SVM.predict(x_ts_sc)

In [359]:
y_pred_SVM

array([1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [360]:
confusion_matrix_SVM = metrics.confusion_matrix(y_ts, y_pred_SVM, labels=[0,1])
Kappa = metrics.cohen_kappa_score(y_ts, y_pred_SVM, weights='linear')
TN, FP, FN, TP = confusion_matrix_SVM.ravel()
SE = TP/(TP+FN)
SP = TN/(TN+FP)
BA = (SE + SP)/2
print("balanced_accuracy = ", round((BA), 2))
print("SE = ", round((SE), 2))
print("SP = ", round((SP), 2))
print("Kappa = ", round((Kappa), 2))

balanced_accuracy =  0.8
SE =  0.76
SP =  0.84
Kappa =  0.6


# 12. Consensus modelling

## 12.1.  5-fold CV consensus

In [361]:
y_pred_cv_con = 1 * (((y_pred_CV_RF + y_pred_CV_GBM + y_pred_CV_SVM) / 3) >= 0.5)

In [362]:
confusion_matrix_cv_con = metrics.confusion_matrix(y_tr, y_pred_cv_con, labels=[0,1])
Kappa = metrics.cohen_kappa_score(y_tr, y_pred_cv_con, weights='linear')
TN, FP, FN, TP = confusion_matrix_cv_con.ravel()
SE = TP/(TP+FN)
SP = TN/(TN+FP)
BA = (SE + SP)/2
print("balanced_accuracy = ", round((BA), 2))
print("SE = ", round((SE), 2))
print("SP = ", round((SP), 2))
print("Kappa = ", round((Kappa), 2))

balanced_accuracy =  0.87
SE =  0.83
SP =  0.92
Kappa =  0.75


## 12.2. Test set consensus

In [366]:
pred_c = 1 * (((y_pred_rf + y_pred_gbm + y_pred_SVM) / 3) >= 0.5)

In [367]:
pred_c

array([1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [368]:
confusion_matrix_GBM = metrics.confusion_matrix(y_ts, pred_c, labels=[0,1])
Kappa = metrics.cohen_kappa_score(y_ts, pred_c, weights='linear')
TN, FP, FN, TP = confusion_matrix_GBM.ravel()
SE = TP/(TP+FN)
SP = TN/(TN+FP)
BA = (SE + SP)/2
print("balanced_accuracy = ", round((BA), 2))
print("SE = ", round((SE), 2))
print("SP = ", round((SP), 2))
print("Kappa = ", round((Kappa), 2))

balanced_accuracy =  0.78
SE =  0.76
SP =  0.8
Kappa =  0.56
