# Importing modules and functions

In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem import rdFingerprintGenerator
from rdkit.ML.Descriptors import MoleculeDescriptors
import chembl_structure_pipeline
from molvs import standardize_smiles
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import joblib
import pickle
from numpy import savetxt
from padelpy import from_sdf
from IPython.display import HTML
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

[05:10:10] Initializing Normalizer


# Data entry and curation work set

In [2]:
# Read the work (training) set from an SDF file. Sanitization is deferred
# (sanitize=False) so that structures RDKit cannot sanitize are collected and
# reported in a table instead of being lost while reading.
uploaded_file_ws="datasets/KRAS_work_from_insilico.sdf"
supplier_ws = Chem.ForwardSDMolSupplier(uploaded_file_ws,sanitize=False)
failed_mols_ws = []      # records that could not be parsed or sanitized
all_mols_ws =[]          # one entry per SDF record (None for unparseable records)
wrong_structure_ws=[]    # 1-based record numbers of the failed molecules
wrong_smiles_ws=[]       # SMILES of the failed molecules ('' when unavailable)
y_tr = []                # "pIC50_mean" property of every record, as returned by GetProp
y_bad_index=[]           # 0-based positions to drop from all_mols_ws / y_tr later

for i, m in enumerate(supplier_ws):
    if m is None:
        # The supplier yields None for a record it cannot parse at all. Store
        # placeholders so positions in all_mols_ws / y_tr stay aligned with
        # y_bad_index, and report the record as failed.
        all_mols_ws.append(None)
        y_tr.append(None)
        failed_mols_ws.append(m)
        wrong_smiles_ws.append('')
        wrong_structure_ws.append(str(i+1))
        y_bad_index.append(i)
        continue
    structure = Chem.Mol(m)
    all_mols_ws.append(structure)
    y_tr.append(m.GetProp("pIC50_mean"))
    try:
        Chem.SanitizeMol(structure)
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate while any sanitization error is recorded.
        failed_mols_ws.append(m)
        wrong_smiles_ws.append(Chem.MolToSmiles(m))
        wrong_structure_ws.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ws), 'molecules')
print('Failed data: ', len(failed_mols_ws), 'molecules')
# Running number (1..n) for each failed molecule, used as the table index.
number_ws = [str(n+1) for n in range(len(failed_mols_ws))]
bad_molecules_ws = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ws, 'SMILES of wrong structure: ': wrong_smiles_ws, 'No.': number_ws}, index=None)
bad_molecules_ws = bad_molecules_ws.set_index('No.')
bad_molecules_ws

Original data:  452 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


Deleting activity values for substances whose structures failed sanitization

In [3]:
# Keep only the activities whose molecules were not flagged during curation.
y_tr[:] = [activity for position, activity in enumerate(y_tr) if position not in y_bad_index]

In [4]:
len(y_tr)

452

# Standardization of the SDF file for the work set

In [5]:
# Drop the molecules flagged as failed during curation, then standardize the
# remaining structures through their SMILES representation.
all_mols_ws[:] = [x for i,x in enumerate(all_mols_ws) if i not in y_bad_index]
records = [Chem.MolToSmiles(mol) for mol in all_mols_ws]

moldf_ws = []
for i,record in enumerate(records):
    standard_record = standardize_smiles(record)
    m = Chem.MolFromSmiles(standard_record)
    if m is None:
        # MolFromSmiles signals a parse failure by returning None. Storing it
        # would only fail later during fingerprinting, and dropping it here
        # would misalign the molecules with y_tr, so stop with a clear message.
        raise ValueError(
            f"Standardized SMILES of kept work-set molecule {i} could not be parsed: {standard_record!r}"
        )
    moldf_ws.append(m)

print('Kept data: ', len(moldf_ws), 'molecules')

Kept data:  452 molecules


In [6]:
# Wrap the standardized molecules in a single-column DataFrame.
moldf_ws = pd.DataFrame(data=moldf_ws, columns=['Mol'])
moldf_ws

Unnamed: 0,Mol
0,<rdkit.Chem.rdchem.Mol object at 0x00000212A3D...
1,<rdkit.Chem.rdchem.Mol object at 0x00000212A3E...
2,<rdkit.Chem.rdchem.Mol object at 0x00000212A3D...
3,<rdkit.Chem.rdchem.Mol object at 0x00000212A3E...
4,<rdkit.Chem.rdchem.Mol object at 0x00000212A3E...
...,...
447,<rdkit.Chem.rdchem.Mol object at 0x00000212A3E...
448,<rdkit.Chem.rdchem.Mol object at 0x00000212A3E...
449,<rdkit.Chem.rdchem.Mol object at 0x00000212A3E...
450,<rdkit.Chem.rdchem.Mol object at 0x00000212A3E...


# Data entry and curation test set

In [7]:
# Read the external test set from an SDF file. Sanitization is deferred
# (sanitize=False) so that structures RDKit cannot sanitize are collected and
# reported in a table instead of being lost while reading.
uploaded_file_ts="datasets/KRAS_test_from_insilico.sdf"
supplier_ts = Chem.ForwardSDMolSupplier(uploaded_file_ts,sanitize=False)
failed_mols_ts = []      # records that could not be parsed or sanitized
all_mols_ts =[]          # one entry per SDF record (None for unparseable records)
wrong_structure_ts=[]    # 1-based record numbers of the failed molecules
wrong_smiles_ts=[]       # SMILES of the failed molecules ('' when unavailable)
y_ts = []                # "pIC50_mean" property of every record, as returned by GetProp
y_bad_index=[]           # 0-based positions to drop from all_mols_ts / y_ts later
for i, m in enumerate(supplier_ts):
    if m is None:
        # The supplier yields None for a record it cannot parse at all. Store
        # placeholders so positions in all_mols_ts / y_ts stay aligned with
        # y_bad_index, and report the record as failed.
        all_mols_ts.append(None)
        y_ts.append(None)
        failed_mols_ts.append(m)
        wrong_smiles_ts.append('')
        wrong_structure_ts.append(str(i+1))
        y_bad_index.append(i)
        continue
    structure = Chem.Mol(m)
    all_mols_ts.append(structure)
    y_ts.append(m.GetProp("pIC50_mean"))
    try:
        Chem.SanitizeMol(structure)
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate while any sanitization error is recorded.
        failed_mols_ts.append(m)
        wrong_smiles_ts.append(Chem.MolToSmiles(m))
        wrong_structure_ts.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ts), 'molecules')
print('Failed data: ', len(failed_mols_ts), 'molecules')
# Running number (1..n) for each failed molecule, used as the table index.
number_ts = [str(n+1) for n in range(len(failed_mols_ts))]
bad_molecules_ts = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ts, 'SMILES of wrong structure: ': wrong_smiles_ts, 'No.': number_ts}, index=None)
bad_molecules_ts = bad_molecules_ts.set_index('No.')
bad_molecules_ts

Original data:  114 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


Deleting activity values for substances whose structures failed sanitization

In [8]:
# Keep only the activities whose molecules were not flagged during curation.
y_ts[:] = [activity for position, activity in enumerate(y_ts) if position not in y_bad_index]

In [9]:
len(y_ts)

114

# Standardization of the SDF file for the test set

In [10]:
# Drop the molecules flagged as failed during curation, then standardize the
# remaining structures through their SMILES representation.
all_mols_ts[:] = [x for i,x in enumerate(all_mols_ts) if i not in y_bad_index]
records = [Chem.MolToSmiles(mol) for mol in all_mols_ts]

moldf_ts = []
for i,record in enumerate(records):
    standard_record = standardize_smiles(record)
    m = Chem.MolFromSmiles(standard_record)
    if m is None:
        # MolFromSmiles signals a parse failure by returning None. Storing it
        # would only fail later during fingerprinting, and dropping it here
        # would misalign the molecules with y_ts, so stop with a clear message.
        raise ValueError(
            f"Standardized SMILES of kept test-set molecule {i} could not be parsed: {standard_record!r}"
        )
    moldf_ts.append(m)

print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  114 molecules


In [11]:
# Wrap the standardized molecules in a single-column DataFrame.
moldf_ts = pd.DataFrame(data=moldf_ts, columns=['Mol'])
moldf_ts

Unnamed: 0,Mol
0,<rdkit.Chem.rdchem.Mol object at 0x00000212A3E...
1,<rdkit.Chem.rdchem.Mol object at 0x00000212A3E...
2,<rdkit.Chem.rdchem.Mol object at 0x00000212A3E...
3,<rdkit.Chem.rdchem.Mol object at 0x00000212A3E...
4,<rdkit.Chem.rdchem.Mol object at 0x00000212A3E...
...,...
109,<rdkit.Chem.rdchem.Mol object at 0x00000212A3E...
110,<rdkit.Chem.rdchem.Mol object at 0x00000212A3E...
111,<rdkit.Chem.rdchem.Mol object at 0x00000212A3E...
112,<rdkit.Chem.rdchem.Mol object at 0x00000212A3E...


# Calculation of Atom Pair fingerprints for the work set

In [15]:
import warnings
from rdkit import RDLogger
import numpy as np
import pandas as pd
from rdkit.Chem import AllChem

RDLogger.DisableLog('rdApp.*')

def calcfp(mol, funcFPInfo=None, fpgen=None):
    """
    Calculate a folded Atom Pair count fingerprint (RDKit) for one molecule.

    Parameters
    ----------
    mol : RDKit molecule to fingerprint.
    funcFPInfo : dict, optional
        Settings; only the "size" key is read (number of bins, default 1024).
        None means {"size": 1024}.
    fpgen : optional
        An existing atom-pair fingerprint generator to reuse. When None a new
        generator is created for this call.

    Returns
    -------
    pandas.Series of length `size`, indexed 'Bit_0' ... 'Bit_<size-1>', holding
    the accumulated counts of each bin.

    Raises
    ------
    TypeError if `mol` is None.
    """
    if funcFPInfo is None:
        # None sentinel instead of a dict default, so no mutable object is
        # shared between calls.
        funcFPInfo = {"size": 1024}
    if mol is None:
        raise TypeError("calcfp expected an RDKit molecule, got None")

    fpSize = int(funcFPInfo.get("size", 1024))
    if fpgen is None:
        fpgen = AllChem.GetAtomPairGenerator(fpSize=fpSize)
    sparse_fp = fpgen.GetSparseCountFingerprint(mol)

    arr = np.zeros(fpSize, dtype=int)

    # Fold keys into [0, fpSize-1] to avoid IndexError and accumulate counts
    for key, count in sparse_fp.GetNonzeroElements().items():
        arr[int(key) % fpSize] += int(count)

    return pd.Series(arr).add_prefix("Bit_")

# --- Helper: vectorize for a list/Series of molecules ---
def calc_fp_df(mols, funcFPInfo=None):
    """
    Return a DataFrame (n_mols x fpSize) of folded Atom Pair count fingerprints.

    Parameters
    ----------
    mols : iterable of RDKit molecules (e.g. a list or a pandas Series).
    funcFPInfo : dict, optional
        Settings forwarded to `calcfp`; None means {"size": 1024}.

    Returns
    -------
    pandas.DataFrame with one row per molecule (index reset to 0..n-1) and the
    'Bit_*' columns produced by `calcfp`.
    """
    if funcFPInfo is None:
        funcFPInfo = {"size": 1024}
    fpSize = int(funcFPInfo.get("size", 1024))
    # Build the generator once and share it across all molecules instead of
    # constructing a new one per molecule.
    fpgen = AllChem.GetAtomPairGenerator(fpSize=fpSize)
    fps = [calcfp(m, funcFPInfo, fpgen) for m in mols]
    df = pd.DataFrame(fps).reset_index(drop=True)
    return df




In [18]:
# Atom Pair count fingerprints (1024 bins) for the work set
desc_ws = calc_fp_df(moldf_ws['Mol'], {'size': 1024})

In [19]:
# Export the work-set fingerprint matrix as a comma-separated text file.
savetxt('models/AtomPair/x_tr_AtomPair.csv', desc_ws, delimiter=',')

In [20]:
# Convert the activities collected from the SDF into a float32 vector.
y_tr = np.array(y_tr, dtype='float32')
y_tr.shape[0]

452

# Calculation of Atom Pair fingerprints for the test set

In [21]:
# Atom Pair count fingerprints (1024 bins) for the test set
desc_ts = calc_fp_df(moldf_ts['Mol'], {'size': 1024})
desc_ts

Unnamed: 0,Bit_0,Bit_1,Bit_2,Bit_3,Bit_4,Bit_5,Bit_6,Bit_7,Bit_8,Bit_9,...,Bit_1014,Bit_1015,Bit_1016,Bit_1017,Bit_1018,Bit_1019,Bit_1020,Bit_1021,Bit_1022,Bit_1023
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
110,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
111,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
112,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Convert the activities collected from the SDF into a float32 vector.
y_ts = np.array(y_ts, dtype='float32')
y_ts.shape[0]

114

# BASELINE

 ## CatBoostRegressor model building and validation

In [23]:
# Random seed passed to the KFold cross-validation splitter.
seed = 42

In [24]:
# Shuffled 5-fold splitter; the fixed seed keeps the folds reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=seed)

In [25]:
%%time
# Exhaustive grid search over tree depth, learning rate and number of boosting
# iterations, evaluated with the splitter `cv`.
model = CatBoostRegressor()
parameters = dict(
    depth=[6, 8, 10],
    learning_rate=[0.01, 0.05, 0.1],
    iterations=[100, 500, 1000],
)

grid = GridSearchCV(estimator=model, param_grid=parameters, cv=cv, n_jobs=-1)
grid.fit(desc_ws, y_tr, verbose=False)

CPU times: total: 29.8 s
Wall time: 4min 7s


In [26]:
# Keep the estimator that the grid search selected as best.
best_CatBR = grid.best_estimator_

In [27]:
grid.best_params_

{'depth': 8, 'iterations': 500, 'learning_rate': 0.05}

In [28]:
# Fit-time options forwarded to the estimator during cross-validation.
params = dict(verbose=False)

In [29]:
%%time
# Out-of-fold predictions for every work-set molecule; `params` is passed to fit().
y_pred_CV_CatBR = cross_val_predict(
    estimator=best_CatBR, X=desc_ws, y=y_tr, cv=cv, params=params
)

CPU times: total: 2min 21s
Wall time: 9.59 s


In [30]:
# Cross-validated coefficient of determination, two decimals.
Q2_CV = round(r2_score(y_true=y_tr, y_pred=y_pred_CV_CatBR), 2)
Q2_CV

0.57

In [31]:
# Cross-validated root-mean-square error, two decimals.
RMSE_CV = round(np.sqrt(mean_squared_error(y_true=y_tr, y_pred=y_pred_CV_CatBR)), 2)
RMSE_CV

0.8

In [32]:
# Predict the test set with the tuned CatBoost model.
# NOTE(review): the "_GBR" suffix is a leftover name; the estimator is a CatBoostRegressor.
y_pred_GBR = best_CatBR.predict(desc_ts)

In [33]:
# Coefficient of determination on the whole test set, two decimals.
Q2_TS = round(r2_score(y_true=y_ts, y_pred=y_pred_GBR), 2)
Q2_TS

0.53

In [34]:
# Root-mean-square error on the whole test set, two decimals.
RMSE_TS = round(np.sqrt(mean_squared_error(y_true=y_ts, y_pred=y_pred_GBR)), 2)
RMSE_TS

0.85

save the model to disk

In [35]:
# Persist the tuned model. The context manager closes (and flushes) the file
# even if pickling fails; the original left the handle open.
# NOTE(review): this path starts with 'Models/' while the fingerprint CSV and
# the MLP model use 'models/' - confirm the directory name, the two differ on
# case-sensitive filesystems.
with open('Models/AtomPair/CatBoost_AtomPair.pkl', 'wb') as model_file:
    pickle.dump(best_CatBR, model_file)

load the model from disk

In [123]:
# Reload the saved model. The context manager closes the file after reading;
# the original left the handle open.
# NOTE: only unpickle files you created yourself - pickle.load can execute
# arbitrary code from the file.
with open('Models/AtomPair/CatBoost_AtomPair.pkl', 'rb') as model_file:
    best_CatBR = pickle.load(model_file)

# Estimating applicability domain. Method - Euclidean distances, K=1

In [36]:
# Pairwise distances within the work set, each column sorted ascending
# (row 0 is the smallest distance of every column).
neighbors_k = np.sort(pairwise_distances(desc_ws, n_jobs=-1), axis=0)

In [37]:
# Tabular view of the sorted work-set distances.
df_tr = pd.DataFrame(data=neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,442,443,444,445,446,447,448,449,450,451
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,11.575837,15.427249,5.830952,13.038405,1.414214,13.114877,0.000000,41.231056,23.409400,12.961481,...,10.862780,10.344080,4.242641,4.690416,8.366600,11.874342,16.522712,10.723805,10.630146,22.605309
2,11.874342,17.916473,11.313708,13.038405,13.564660,15.165751,4.000000,41.677332,24.494897,15.165751,...,11.916375,12.409674,4.242641,11.313708,10.630146,12.165525,22.293497,14.662878,11.916375,26.944387
3,12.409674,18.841444,11.661904,13.190906,13.674794,15.231546,12.247449,42.520583,35.411862,15.231546,...,14.387495,13.674794,7.071068,14.832397,11.661904,12.884099,24.372115,15.588457,12.409674,28.809721
4,13.638182,19.798990,13.638182,14.764823,17.175564,15.842980,12.328828,42.906876,35.580894,15.588457,...,15.297059,13.674794,7.615773,16.217275,12.409674,14.422205,25.059928,17.291616,12.845233,31.496031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,80.068720,114.052619,118.827606,116.974356,95.268043,113.057507,99.789779,134.655115,114.603665,100.628028,...,91.334550,88.164619,82.939737,86.261231,86.034877,84.148678,104.775951,90.680759,88.323270,95.723560
448,83.486526,117.970335,121.433109,118.042365,95.404402,113.335784,100.508706,135.915415,120.867696,101.483989,...,92.784697,92.265920,86.729464,86.296002,89.526532,89.397987,105.773343,95.666086,93.043001,96.907172
449,94.228446,127.043300,131.556072,129.680376,95.540567,126.336851,100.558441,137.382677,123.377470,101.695624,...,93.637599,92.585096,91.891240,87.412814,92.487837,94.461632,106.254412,96.161323,93.112835,97.380696
450,94.731199,131.244047,134.729358,130.245921,99.352906,127.302789,101.911727,139.240799,131.030531,101.774260,...,95.351980,93.829633,93.386294,92.541882,93.085982,94.604440,107.605762,96.213305,93.648278,99.171569


In [38]:
# Alias of the sorted work-set distance matrix used for the AD statistics.
similarity= neighbors_k

In [39]:
# Mean of row 1 of the sorted matrix (second-smallest distance per column).
Dmean = similarity[1].mean()

In [40]:
round(Dmean, 2)

11.78

In [41]:
# Standard deviation of row 1 of the sorted matrix.
std = similarity[1].std()

In [42]:
round(std, 2)

6.73

In [43]:
# Applicability-domain threshold: mean distance plus half a standard deviation.
model_AD_limit = Dmean + 0.5 * std
print(np.round(model_AD_limit, 2))

15.15


In [44]:
# Work-set x test-set distances; every column (one test compound) sorted ascending.
neighbors_k_ts = np.sort(pairwise_distances(desc_ws, Y=desc_ts, n_jobs=-1), axis=0)

In [45]:
# Tabular view of the sorted work-to-test distances.
x_ts_AD = pd.DataFrame(data=neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,104,105,106,107,108,109,110,111,112,113
0,5.656854,3.464102,5.291503,14.000000,10.295630,23.579652,0.000000,14.142136,10.677078,1.414214,...,25.534291,17.888544,10.198039,11.135529,0.000000,7.071068,10.488088,14.866069,10.862780,12.961481
1,18.841444,9.219544,13.038405,18.574176,10.770330,26.248809,10.770330,14.933185,13.114877,10.677078,...,26.589472,22.847319,12.489996,12.165525,8.366600,11.916375,11.575837,20.469489,12.000000,14.212670
2,19.467922,12.489996,13.190906,19.026298,10.954451,27.513633,10.954451,16.309506,18.601075,10.862780,...,27.874720,44.124823,13.564660,12.489996,10.630146,14.177447,12.000000,20.639767,12.649111,15.264338
3,20.712315,12.727922,14.071247,20.712315,11.045361,29.512709,12.083046,16.881943,25.278449,12.083046,...,29.223278,45.967380,14.832397,14.106736,11.661904,15.000000,12.489996,23.345235,13.038405,15.297059
4,21.563859,12.727922,14.933185,21.517435,13.341664,29.512709,12.727922,17.464249,25.436195,12.649111,...,30.248967,46.368092,15.842980,14.560220,12.409674,16.552945,13.820275,29.799329,13.341664,17.435596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,102.176318,125.654288,100.692602,92.455395,115.529217,136.502747,98.295473,114.503275,102.771591,98.295473,...,110.199819,120.656537,83.898749,85.416626,86.034877,89.988888,82.012194,115.321290,98.071403,92.795474
448,104.417431,127.212421,101.666120,92.487837,117.388245,139.154590,100.019998,117.102519,102.975725,99.879928,...,111.040533,121.078487,88.769364,86.249638,89.526532,90.194235,85.988371,115.943952,99.909959,93.021503
449,117.724254,135.786597,101.867561,94.884140,130.437725,150.933760,100.094955,129.058126,103.817147,100.134909,...,112.472219,121.181682,93.429118,86.579443,92.487837,91.181138,93.973400,116.365803,112.227448,93.107465
450,118.970585,140.914868,101.877377,96.202911,131.175455,152.597510,100.990099,130.786085,104.388697,101.089070,...,113.520923,122.816937,94.678403,88.396833,93.085982,91.350972,94.148818,116.511802,115.438295,93.834962


In [46]:
# Row 0 of the sorted matrix: distance from each test compound to its nearest
# work-set compound, rounded to three decimals.
similarity_ts = neighbors_k_ts
cpd_AD = similarity_ts[0]
cpd_value = cpd_AD.round(3)
print(cpd_value)

[ 5.657  3.464  5.292 14.    10.296 23.58   0.    14.142 10.677  1.414
 28.16  14.107 10.198 15.427 14.036 13.528 16.673 10.583  1.414  9.22
 12.41  10.198 12.649 14.177 10.77  12.124 24.88  16.34   8.     6.
  1.414  8.832 17.748 11.705  4.472 13.82  15.684 12.247 11.576  0.
 10.536 19.875  8.718  1.414 12.    10.954 13.528  0.    14.353  6.928
 12.845 10.198 11.533 13.115 18.708 15.875 14.765  7.483 23.749 15.1
 10.149 20.174 12.649  1.414 10.247 12.369  6.782  9.165 10.    11.874
 12.    22.694 20.396  3.464 14.832 16.186  4.     9.849  6.782 14.866
 19.416 17.833  0.    20.224  9.487  7.211 13.856 17.916 36.014  0.
 13.077 10.296  4.899  4.     1.414  1.414 14.457 10.198 15.362  9.798
 12.767  9.798 11.832 14.799 25.534 17.889 10.198 11.136  0.     7.071
 10.488 14.866 10.863 12.961]


In [47]:
# True where the nearest-neighbour distance is within the AD threshold.
cpd_AD = cpd_value <= model_AD_limit
print(cpd_AD)

[ True  True  True  True  True False  True  True  True  True False  True
  True False  True  True False  True  True  True  True  True  True  True
  True  True False False  True  True  True  True False  True  True  True
 False  True  True  True  True False  True  True  True  True  True  True
  True  True  True  True  True  True False False  True  True False  True
  True False  True  True  True  True  True  True  True  True  True False
 False  True  True False  True  True  True  True False False  True False
  True  True  True False False  True  True  True  True  True  True  True
  True  True False  True  True  True  True  True False False  True  True
  True  True  True  True  True  True]


In [48]:
# Fraction of test compounds that fall inside the applicability domain.
print("Coverage = ", round(cpd_AD.sum() / cpd_AD.size, 2))

Coverage =  0.79


In [49]:
# Positions of the test compounds inside the applicability domain.
print("Indices of substances included in AD = ", np.flatnonzero(cpd_AD))

Indices of substances included in AD =  [  0   1   2   3   4   6   7   8   9  11  12  14  15  17  18  19  20  21
  22  23  24  25  28  29  30  31  33  34  35  37  38  39  40  42  43  44
  45  46  47  48  49  50  51  52  53  56  57  59  60  62  63  64  65  66
  67  68  69  70  73  74  76  77  78  79  82  84  85  86  89  90  91  92
  93  94  95  96  97  99 100 101 102 103 106 107 108 109 110 111 112 113]


In [50]:
# Positions of the test compounds outside the applicability domain.
out_Ad = list(np.flatnonzero(cpd_AD == 0))

# Prediction only for molecules included in  AD

In [51]:
# Work on a list copy so the full prediction array stays untouched.
y_pred_GBR_ad = [prediction for prediction in y_pred_GBR]

In [52]:
# Remove predictions of compounds outside the applicability domain.
y_pred_GBR_ad[:] = [value for position, value in enumerate(y_pred_GBR_ad) if position not in out_Ad]

In [53]:
len(y_pred_GBR_ad)

90

In [54]:
# Work on a list copy so the full activity array stays untouched.
y_ts_ad = [activity for activity in y_ts]

In [55]:
# Remove observed activities of compounds outside the applicability domain.
y_ts_ad[:] = [value for position, value in enumerate(y_ts_ad) if position not in out_Ad]

In [56]:
len(y_ts_ad)

90

In [57]:
# Coefficient of determination for test compounds inside the AD, two decimals.
Q2_TS = round(r2_score(y_true=y_ts_ad, y_pred=y_pred_GBR_ad), 2)
Q2_TS

0.52

In [58]:
# Root-mean-square error for test compounds inside the AD, two decimals.
RMSE_TS = round(np.sqrt(mean_squared_error(y_true=y_ts_ad, y_pred=y_pred_GBR_ad)), 2)
RMSE_TS

0.87

# SVM model building and validation

In [59]:
from sklearn.svm import SVR

In [60]:
# Logarithmic search grid for the SVR: C spans 10^0..10^4, gamma spans 10^-6..10^-1.
param_grid = dict(
    C=[10 ** exponent for exponent in range(5)],
    gamma=[10 ** exponent for exponent in range(-6, 0)],
)

In [61]:
# Same seeded, shuffled 5-fold splitter as used for the other models.
seed = 42
cv = KFold(n_splits=5, shuffle=True, random_state=seed)

In [62]:
# Grid search over C and gamma for an epsilon-SVR.
svm = GridSearchCV(
    estimator=SVR(C=1.0, epsilon=0.2),
    param_grid=param_grid,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

In [63]:
# Run the grid search on the work set.
svm.fit(X=desc_ws, y=y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [64]:
# Keep the estimator that the grid search selected as best. The preceding bare
# `svm.best_params_` expression was removed: it was not the last line of the
# cell, so its value was discarded and never displayed.
best_svm = svm.best_estimator_

In [65]:
svm.best_params_

{'C': 100, 'gamma': 0.0001}

In [66]:
# Out-of-fold predictions of the tuned SVR for every work-set molecule.
y_pred_CV_svm = cross_val_predict(estimator=best_svm, X=desc_ws, y=y_tr, cv=cv)

In [67]:
# Cross-validated coefficient of determination, two decimals.
Q2_CV = round(r2_score(y_true=y_tr, y_pred=y_pred_CV_svm), 2)
Q2_CV

0.61

In [68]:
# Cross-validated root-mean-square error, two decimals.
RMSE_CV = round(np.sqrt(mean_squared_error(y_true=y_tr, y_pred=y_pred_CV_svm)), 2)
RMSE_CV

0.76

#  Prediction for test set's molecules

In [69]:
# float32 array versions of the test-set descriptors and activities.
x_ts = np.array(desc_ts, dtype='float32')
y_ts = np.array(y_ts, dtype='float32')

In [70]:
# Predict the test set with the tuned SVR.
y_pred_svm = best_svm.predict(X=x_ts)

In [71]:
# Coefficient of determination on the whole test set, two decimals.
Q2_TS = round(r2_score(y_true=y_ts, y_pred=y_pred_svm), 2)
Q2_TS

0.56

In [72]:
# Root-mean-square error on the whole test set, two decimals.
RMSE_TS = round(np.sqrt(mean_squared_error(y_true=y_ts, y_pred=y_pred_svm)), 2)
RMSE_TS

0.82

save the model to disk

In [71]:
# Persist the tuned SVR. The context manager closes (and flushes) the file even
# if pickling fails; the original left the handle open.
# NOTE(review): this path starts with 'Models/' while the fingerprint CSV and
# the MLP model use 'models/' - confirm the directory name, the two differ on
# case-sensitive filesystems.
with open('Models/AtomPair/SVM_AtomPair.pkl', 'wb') as model_file:
    pickle.dump(best_svm, model_file)

load the model from disk

In [165]:
# Reload the saved SVR. The context manager closes the file after reading; the
# original left the handle open.
# NOTE: only unpickle files you created yourself - pickle.load can execute
# arbitrary code from the file.
with open('Models/AtomPair/SVM_AtomPair.pkl', 'rb') as model_file:
    best_svm = pickle.load(model_file)

# Estimating applicability domain. Method - Euclidean distances, K=1

In [73]:
# Pairwise distances within the work set, each column sorted ascending
# (row 0 is the smallest distance of every column).
neighbors_k = np.sort(pairwise_distances(desc_ws, n_jobs=-1), axis=0)

In [74]:
# Tabular view of the sorted work-set distances.
df_tr = pd.DataFrame(data=neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,442,443,444,445,446,447,448,449,450,451
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,11.575837,15.427249,5.830952,13.038405,1.414214,13.114877,0.000000,41.231056,23.409400,12.961481,...,10.862780,10.344080,4.242641,4.690416,8.366600,11.874342,16.522712,10.723805,10.630146,22.605309
2,11.874342,17.916473,11.313708,13.038405,13.564660,15.165751,4.000000,41.677332,24.494897,15.165751,...,11.916375,12.409674,4.242641,11.313708,10.630146,12.165525,22.293497,14.662878,11.916375,26.944387
3,12.409674,18.841444,11.661904,13.190906,13.674794,15.231546,12.247449,42.520583,35.411862,15.231546,...,14.387495,13.674794,7.071068,14.832397,11.661904,12.884099,24.372115,15.588457,12.409674,28.809721
4,13.638182,19.798990,13.638182,14.764823,17.175564,15.842980,12.328828,42.906876,35.580894,15.588457,...,15.297059,13.674794,7.615773,16.217275,12.409674,14.422205,25.059928,17.291616,12.845233,31.496031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,80.068720,114.052619,118.827606,116.974356,95.268043,113.057507,99.789779,134.655115,114.603665,100.628028,...,91.334550,88.164619,82.939737,86.261231,86.034877,84.148678,104.775951,90.680759,88.323270,95.723560
448,83.486526,117.970335,121.433109,118.042365,95.404402,113.335784,100.508706,135.915415,120.867696,101.483989,...,92.784697,92.265920,86.729464,86.296002,89.526532,89.397987,105.773343,95.666086,93.043001,96.907172
449,94.228446,127.043300,131.556072,129.680376,95.540567,126.336851,100.558441,137.382677,123.377470,101.695624,...,93.637599,92.585096,91.891240,87.412814,92.487837,94.461632,106.254412,96.161323,93.112835,97.380696
450,94.731199,131.244047,134.729358,130.245921,99.352906,127.302789,101.911727,139.240799,131.030531,101.774260,...,95.351980,93.829633,93.386294,92.541882,93.085982,94.604440,107.605762,96.213305,93.648278,99.171569


In [75]:
# Alias of the sorted work-set distance matrix used for the AD statistics.
similarity= neighbors_k

In [76]:
# Mean of row 1 of the sorted matrix (second-smallest distance per column).
Dmean = similarity[1].mean()

In [77]:
round(Dmean, 2)

11.78

In [78]:
# Standard deviation of row 1 of the sorted matrix.
std = similarity[1].std()

In [79]:
round(std, 2)

6.73

In [80]:
# Applicability-domain threshold: mean distance plus half a standard deviation.
model_AD_limit = Dmean + 0.5 * std
print(np.round(model_AD_limit, 2))

15.15


In [81]:
# Work-set x test-set distances; every column (one test compound) sorted ascending.
neighbors_k_ts = np.sort(pairwise_distances(desc_ws, Y=x_ts, n_jobs=-1), axis=0)

In [82]:
# Tabular view of the sorted work-to-test distances.
x_ts_AD = pd.DataFrame(data=neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,104,105,106,107,108,109,110,111,112,113
0,5.656854,3.464102,5.291503,14.000000,10.295630,23.579652,0.000000,14.142136,10.677078,1.414214,...,25.534291,17.888544,10.198039,11.135529,0.000000,7.071068,10.488088,14.866069,10.862780,12.961481
1,18.841444,9.219544,13.038405,18.574176,10.770330,26.248809,10.770330,14.933185,13.114877,10.677078,...,26.589472,22.847319,12.489996,12.165525,8.366600,11.916375,11.575837,20.469489,12.000000,14.212670
2,19.467922,12.489996,13.190906,19.026298,10.954451,27.513633,10.954451,16.309506,18.601075,10.862780,...,27.874720,44.124823,13.564660,12.489996,10.630146,14.177447,12.000000,20.639767,12.649111,15.264338
3,20.712315,12.727922,14.071247,20.712315,11.045361,29.512709,12.083046,16.881943,25.278449,12.083046,...,29.223278,45.967380,14.832397,14.106736,11.661904,15.000000,12.489996,23.345235,13.038405,15.297059
4,21.563859,12.727922,14.933185,21.517435,13.341664,29.512709,12.727922,17.464249,25.436195,12.649111,...,30.248967,46.368092,15.842980,14.560220,12.409674,16.552945,13.820275,29.799329,13.341664,17.435596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,102.176318,125.654288,100.692602,92.455395,115.529217,136.502747,98.295473,114.503275,102.771591,98.295473,...,110.199819,120.656537,83.898749,85.416626,86.034877,89.988888,82.012194,115.321290,98.071403,92.795474
448,104.417431,127.212421,101.666120,92.487837,117.388245,139.154590,100.019998,117.102519,102.975725,99.879928,...,111.040533,121.078487,88.769364,86.249638,89.526532,90.194235,85.988371,115.943952,99.909959,93.021503
449,117.724254,135.786597,101.867561,94.884140,130.437725,150.933760,100.094955,129.058126,103.817147,100.134909,...,112.472219,121.181682,93.429118,86.579443,92.487837,91.181138,93.973400,116.365803,112.227448,93.107465
450,118.970585,140.914868,101.877377,96.202911,131.175455,152.597510,100.990099,130.786085,104.388697,101.089070,...,113.520923,122.816937,94.678403,88.396833,93.085982,91.350972,94.148818,116.511802,115.438295,93.834962


In [83]:
# Row 0 of the sorted matrix: distance from each test compound to its nearest
# work-set compound, rounded to three decimals.
similarity_ts = neighbors_k_ts
cpd_AD = similarity_ts[0]
cpd_value = cpd_AD.round(3)
print(cpd_value)

[ 5.657  3.464  5.292 14.    10.296 23.58   0.    14.142 10.677  1.414
 28.16  14.107 10.198 15.427 14.036 13.528 16.673 10.583  1.414  9.22
 12.41  10.198 12.649 14.177 10.77  12.124 24.88  16.34   8.     6.
  1.414  8.832 17.748 11.705  4.472 13.82  15.684 12.247 11.576  0.
 10.536 19.875  8.718  1.414 12.    10.954 13.528  0.    14.353  6.928
 12.845 10.198 11.533 13.115 18.708 15.875 14.765  7.483 23.749 15.1
 10.149 20.174 12.649  1.414 10.247 12.369  6.782  9.165 10.    11.874
 12.    22.694 20.396  3.464 14.832 16.186  4.     9.849  6.782 14.866
 19.416 17.833  0.    20.224  9.487  7.211 13.856 17.916 36.014  0.
 13.077 10.296  4.899  4.     1.414  1.414 14.457 10.198 15.362  9.798
 12.767  9.798 11.832 14.799 25.534 17.889 10.198 11.136  0.     7.071
 10.488 14.866 10.863 12.961]


In [84]:
# True where the nearest-neighbour distance is within the AD threshold.
cpd_AD = cpd_value <= model_AD_limit
print(cpd_AD)

[ True  True  True  True  True False  True  True  True  True False  True
  True False  True  True False  True  True  True  True  True  True  True
  True  True False False  True  True  True  True False  True  True  True
 False  True  True  True  True False  True  True  True  True  True  True
  True  True  True  True  True  True False False  True  True False  True
  True False  True  True  True  True  True  True  True  True  True False
 False  True  True False  True  True  True  True False False  True False
  True  True  True False False  True  True  True  True  True  True  True
  True  True False  True  True  True  True  True False False  True  True
  True  True  True  True  True  True]


In [85]:
# Fraction of test compounds that fall inside the applicability domain.
print("Coverage = ", round(cpd_AD.sum() / cpd_AD.size, 2))

Coverage =  0.79


In [86]:
# Positions of the test compounds inside the applicability domain.
print("Indices of substances included in AD = ", np.flatnonzero(cpd_AD))

Indices of substances included in AD =  [  0   1   2   3   4   6   7   8   9  11  12  14  15  17  18  19  20  21
  22  23  24  25  28  29  30  31  33  34  35  37  38  39  40  42  43  44
  45  46  47  48  49  50  51  52  53  56  57  59  60  62  63  64  65  66
  67  68  69  70  73  74  76  77  78  79  82  84  85  86  89  90  91  92
  93  94  95  96  97  99 100 101 102 103 106 107 108 109 110 111 112 113]


In [87]:
# Positions of the test compounds outside the applicability domain.
out_Ad = list(np.flatnonzero(cpd_AD == 0))

# Prediction only for molecules included in  AD

In [88]:
# Work on a list copy so the full prediction array stays untouched.
y_pred_svm_ad = [prediction for prediction in y_pred_svm]

In [89]:
# Remove predictions of compounds outside the applicability domain.
y_pred_svm_ad[:] = [value for position, value in enumerate(y_pred_svm_ad) if position not in out_Ad]

In [90]:
len(y_pred_svm_ad)

90

In [91]:
# Work on a list copy so the full activity array stays untouched.
y_ts_ad = [activity for activity in y_ts]

In [92]:
# Remove observed activities of compounds outside the applicability domain.
y_ts_ad[:] = [value for position, value in enumerate(y_ts_ad) if position not in out_Ad]

In [93]:
len(y_ts_ad)

90

In [94]:
# Coefficient of determination for test compounds inside the AD, two decimals.
Q2_TS = round(r2_score(y_true=y_ts_ad, y_pred=y_pred_svm_ad), 2)
Q2_TS

0.61

In [95]:
# Root-mean-square error for test compounds inside the AD, two decimals.
RMSE_TS = round(np.sqrt(mean_squared_error(y_true=y_ts_ad, y_pred=y_pred_svm_ad)), 2)
RMSE_TS

0.79

# Multi-layer Perceptron regressor

In [96]:
from sklearn.neural_network import MLPRegressor

In [97]:
# Search grid for the MLP regressor: network shape, activation, solver,
# L2 penalty (alpha) and iteration budget.
param_grid = dict(
    hidden_layer_sizes=[(400, 300, 200, 100), (100, 100, 100), (10, 10, 10), (50,)],
    activation=["tanh", "relu"],
    solver=["lbfgs", "sgd", "adam"],
    alpha=[0.00005, 0.0005],
    max_iter=[1000, 2000],
)

In [98]:
# Grid search for the MLP regressor. random_state is fixed because weight
# initialisation (and batch sampling for sgd/adam) is random: without it every
# run selects and refits a different model, so the reported metrics cannot be
# reproduced.
m = GridSearchCV(MLPRegressor(random_state=seed), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [99]:
# Run the grid search on the work set.
m.fit(X=desc_ws, y=y_tr)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [100]:
# Keep the estimator that the grid search selected as best.
best_MLPR = m.best_estimator_

In [101]:
# Out-of-fold predictions of the tuned MLP for every work-set molecule.
y_pred_CV_MLPR = cross_val_predict(estimator=best_MLPR, X=desc_ws, y=y_tr, cv=cv)

In [102]:
# Cross-validated coefficient of determination, two decimals.
Q2_CV = round(r2_score(y_true=y_tr, y_pred=y_pred_CV_MLPR), 2)
Q2_CV

0.57

In [103]:
# Cross-validated root-mean-square error, two decimals.
RMSE_CV = round(np.sqrt(mean_squared_error(y_true=y_tr, y_pred=y_pred_CV_MLPR)), 2)
RMSE_CV

0.8

# Prediction for test set's molecules

In [104]:
# Predict the test set with the tuned MLP.
y_pred_MLPR = best_MLPR.predict(X=desc_ts)

In [105]:
# Coefficient of determination on the whole test set, two decimals.
Q2_TS = round(r2_score(y_true=y_ts, y_pred=y_pred_MLPR), 2)
Q2_TS

0.59

In [106]:
# Root-mean-square error on the whole test set, two decimals.
RMSE_TS = round(np.sqrt(mean_squared_error(y_true=y_ts, y_pred=y_pred_MLPR)), 2)
RMSE_TS

0.79

save the model to disk

In [107]:
# Persist the tuned MLP. The context manager closes (and flushes) the file even
# if pickling fails; the original left the handle open.
# NOTE(review): this path starts with 'models/' while the CatBoost and SVM
# models are saved under 'Models/' - confirm the directory name, the two differ
# on case-sensitive filesystems.
with open('models/AtomPair/MLPR_AtomPair.pkl', 'wb') as model_file:
    pickle.dump(best_MLPR, model_file)

# Estimating applicability domain. Method - Euclidean distances, K=1

In [108]:
# Pairwise distances within the work set, each column sorted ascending
# (row 0 is the smallest distance of every column).
neighbors_k = np.sort(pairwise_distances(desc_ws, n_jobs=-1), axis=0)

In [109]:
# Tabular view of the sorted work-set distances.
df_tr = pd.DataFrame(data=neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,442,443,444,445,446,447,448,449,450,451
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,11.575837,15.427249,5.830952,13.038405,1.414214,13.114877,0.000000,41.231056,23.409400,12.961481,...,10.862780,10.344080,4.242641,4.690416,8.366600,11.874342,16.522712,10.723805,10.630146,22.605309
2,11.874342,17.916473,11.313708,13.038405,13.564660,15.165751,4.000000,41.677332,24.494897,15.165751,...,11.916375,12.409674,4.242641,11.313708,10.630146,12.165525,22.293497,14.662878,11.916375,26.944387
3,12.409674,18.841444,11.661904,13.190906,13.674794,15.231546,12.247449,42.520583,35.411862,15.231546,...,14.387495,13.674794,7.071068,14.832397,11.661904,12.884099,24.372115,15.588457,12.409674,28.809721
4,13.638182,19.798990,13.638182,14.764823,17.175564,15.842980,12.328828,42.906876,35.580894,15.588457,...,15.297059,13.674794,7.615773,16.217275,12.409674,14.422205,25.059928,17.291616,12.845233,31.496031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,80.068720,114.052619,118.827606,116.974356,95.268043,113.057507,99.789779,134.655115,114.603665,100.628028,...,91.334550,88.164619,82.939737,86.261231,86.034877,84.148678,104.775951,90.680759,88.323270,95.723560
448,83.486526,117.970335,121.433109,118.042365,95.404402,113.335784,100.508706,135.915415,120.867696,101.483989,...,92.784697,92.265920,86.729464,86.296002,89.526532,89.397987,105.773343,95.666086,93.043001,96.907172
449,94.228446,127.043300,131.556072,129.680376,95.540567,126.336851,100.558441,137.382677,123.377470,101.695624,...,93.637599,92.585096,91.891240,87.412814,92.487837,94.461632,106.254412,96.161323,93.112835,97.380696
450,94.731199,131.244047,134.729358,130.245921,99.352906,127.302789,101.911727,139.240799,131.030531,101.774260,...,95.351980,93.829633,93.386294,92.541882,93.085982,94.604440,107.605762,96.213305,93.648278,99.171569


In [110]:
# Alias of the sorted work-set distance matrix used for the AD statistics.
similarity= neighbors_k

In [111]:
# Mean of row 1 of the sorted matrix (second-smallest distance per column).
Dmean = similarity[1].mean()

In [112]:
round(Dmean, 2)

11.78

In [113]:
# Standard deviation of row 1 of the sorted matrix.
std = similarity[1].std()

In [114]:
round(std, 2)

6.73

In [115]:
# Applicability-domain threshold: mean distance plus half a standard deviation.
model_AD_limit = Dmean + 0.5 * std
print(np.round(model_AD_limit, 2))

15.15


In [116]:
# Work-set x test-set distances; every column (one test compound) sorted ascending.
neighbors_k_ts = np.sort(pairwise_distances(desc_ws, Y=desc_ts, n_jobs=-1), axis=0)

In [117]:
# Tabular view of the sorted work-to-test distances.
x_ts_AD = pd.DataFrame(data=neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,104,105,106,107,108,109,110,111,112,113
0,5.656854,3.464102,5.291503,14.000000,10.295630,23.579652,0.000000,14.142136,10.677078,1.414214,...,25.534291,17.888544,10.198039,11.135529,0.000000,7.071068,10.488088,14.866069,10.862780,12.961481
1,18.841444,9.219544,13.038405,18.574176,10.770330,26.248809,10.770330,14.933185,13.114877,10.677078,...,26.589472,22.847319,12.489996,12.165525,8.366600,11.916375,11.575837,20.469489,12.000000,14.212670
2,19.467922,12.489996,13.190906,19.026298,10.954451,27.513633,10.954451,16.309506,18.601075,10.862780,...,27.874720,44.124823,13.564660,12.489996,10.630146,14.177447,12.000000,20.639767,12.649111,15.264338
3,20.712315,12.727922,14.071247,20.712315,11.045361,29.512709,12.083046,16.881943,25.278449,12.083046,...,29.223278,45.967380,14.832397,14.106736,11.661904,15.000000,12.489996,23.345235,13.038405,15.297059
4,21.563859,12.727922,14.933185,21.517435,13.341664,29.512709,12.727922,17.464249,25.436195,12.649111,...,30.248967,46.368092,15.842980,14.560220,12.409674,16.552945,13.820275,29.799329,13.341664,17.435596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,102.176318,125.654288,100.692602,92.455395,115.529217,136.502747,98.295473,114.503275,102.771591,98.295473,...,110.199819,120.656537,83.898749,85.416626,86.034877,89.988888,82.012194,115.321290,98.071403,92.795474
448,104.417431,127.212421,101.666120,92.487837,117.388245,139.154590,100.019998,117.102519,102.975725,99.879928,...,111.040533,121.078487,88.769364,86.249638,89.526532,90.194235,85.988371,115.943952,99.909959,93.021503
449,117.724254,135.786597,101.867561,94.884140,130.437725,150.933760,100.094955,129.058126,103.817147,100.134909,...,112.472219,121.181682,93.429118,86.579443,92.487837,91.181138,93.973400,116.365803,112.227448,93.107465
450,118.970585,140.914868,101.877377,96.202911,131.175455,152.597510,100.990099,130.786085,104.388697,101.089070,...,113.520923,122.816937,94.678403,88.396833,93.085982,91.350972,94.148818,116.511802,115.438295,93.834962


In [118]:
# Row 0 of the sorted matrix: distance from each test compound to its nearest
# work-set compound, rounded to three decimals.
similarity_ts = neighbors_k_ts
cpd_AD = similarity_ts[0]
cpd_value = cpd_AD.round(3)
print(cpd_value)

[ 5.657  3.464  5.292 14.    10.296 23.58   0.    14.142 10.677  1.414
 28.16  14.107 10.198 15.427 14.036 13.528 16.673 10.583  1.414  9.22
 12.41  10.198 12.649 14.177 10.77  12.124 24.88  16.34   8.     6.
  1.414  8.832 17.748 11.705  4.472 13.82  15.684 12.247 11.576  0.
 10.536 19.875  8.718  1.414 12.    10.954 13.528  0.    14.353  6.928
 12.845 10.198 11.533 13.115 18.708 15.875 14.765  7.483 23.749 15.1
 10.149 20.174 12.649  1.414 10.247 12.369  6.782  9.165 10.    11.874
 12.    22.694 20.396  3.464 14.832 16.186  4.     9.849  6.782 14.866
 19.416 17.833  0.    20.224  9.487  7.211 13.856 17.916 36.014  0.
 13.077 10.296  4.899  4.     1.414  1.414 14.457 10.198 15.362  9.798
 12.767  9.798 11.832 14.799 25.534 17.889 10.198 11.136  0.     7.071
 10.488 14.866 10.863 12.961]


In [119]:
# True where the nearest-neighbour distance is within the AD threshold.
cpd_AD = cpd_value <= model_AD_limit
print(cpd_AD)

[ True  True  True  True  True False  True  True  True  True False  True
  True False  True  True False  True  True  True  True  True  True  True
  True  True False False  True  True  True  True False  True  True  True
 False  True  True  True  True False  True  True  True  True  True  True
  True  True  True  True  True  True False False  True  True False  True
  True False  True  True  True  True  True  True  True  True  True False
 False  True  True False  True  True  True  True False False  True False
  True  True  True False False  True  True  True  True  True  True  True
  True  True False  True  True  True  True  True False False  True  True
  True  True  True  True  True  True]


In [120]:
# Fraction of test compounds that fall inside the applicability domain.
print("Coverage = ", round(cpd_AD.sum() / cpd_AD.size, 2))

Coverage =  0.79


In [121]:
# Positions of the test compounds inside the applicability domain.
print("Indices of substances included in AD = ", np.flatnonzero(cpd_AD))

Indices of substances included in AD =  [  0   1   2   3   4   6   7   8   9  11  12  14  15  17  18  19  20  21
  22  23  24  25  28  29  30  31  33  34  35  37  38  39  40  42  43  44
  45  46  47  48  49  50  51  52  53  56  57  59  60  62  63  64  65  66
  67  68  69  70  73  74  76  77  78  79  82  84  85  86  89  90  91  92
  93  94  95  96  97  99 100 101 102 103 106 107 108 109 110 111 112 113]


In [122]:
# Positions of the test compounds outside the applicability domain.
out_Ad = list(np.flatnonzero(cpd_AD == 0))

# Prediction only for molecules included in  AD

In [123]:
# Work on a list copy so the full prediction array stays untouched.
y_pred_MLPR_ad = [prediction for prediction in y_pred_MLPR]

In [124]:
# Remove predictions of compounds outside the applicability domain.
y_pred_MLPR_ad[:] = [value for position, value in enumerate(y_pred_MLPR_ad) if position not in out_Ad]

In [125]:
len(y_pred_MLPR_ad)

90

In [126]:
# Work on a list copy so the full activity array stays untouched.
y_ts_ad = [activity for activity in y_ts]

In [127]:
# Remove observed activities of compounds outside the applicability domain.
y_ts_ad[:] = [value for position, value in enumerate(y_ts_ad) if position not in out_Ad]

In [128]:
len(y_ts_ad)

90

In [129]:
# Coefficient of determination for test compounds inside the AD, two decimals.
Q2_TS = round(r2_score(y_true=y_ts_ad, y_pred=y_pred_MLPR_ad), 2)
Q2_TS

0.58

In [130]:
# Root-mean-square error for test compounds inside the AD, two decimals.
RMSE_TS = round(np.sqrt(mean_squared_error(y_true=y_ts_ad, y_pred=y_pred_MLPR_ad)), 2)
RMSE_TS

0.82