# 1. Importing modules and functions

In [1]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from molvs import standardize_smiles
from copy import deepcopy
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import joblib
import pickle
from numpy import savetxt
from padelpy import from_sdf
import shap
from tqdm.notebook import tqdm

In [2]:
def convert_smi_to_canon_smi(smi):
    
    try:
        canon_smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi),isomericSmiles = False)
    except:
        canon_smi='wrong_smiles'
    return canon_smi

In [3]:
def standart(smi):
    global m
    if smi!='wrong_smiles':
        try:
            smiles=standardize_smiles(smi)
            m = Chem.MolFromSmiles(smi)
        except:
            smi='error kekule'
    else:
        m = 'check the smiles'
    return m

## Load data and curation work set

In [4]:
# Set file path
df_ws=pd.read_csv('rat_oral_LD50_WS.csv')
df_ws

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg"
0,626-48-2,Cc1cc(=O)[nH]c(=O)[nH]1,0.291207,64463.0000
1,27849-94-1,CC(CCc1ccc2c(c1)OCO2)NN,0.440660,75449.0000
2,110-54-3,CCCCCC,0.537460,24980.0000
3,1330-92-3,CCCCC(CC)COC(=O)C1=C(C(=O)OCC(CC)CCCC)CCCC1,0.539250,113917.0000
4,57-55-6,CC(O)CO,0.580330,19989.0000
...,...,...,...,...
7707,56073-10-0,O=c1oc2ccccc2c(O)c1C1CC(c2ccc(-c3ccc(Br)cc3)cc...,6.514700,0.1596
7708,130209-82-4,CC(C)OC(=O)CCC/C=C/CC1C(O)CC(O)C1CCC(O)CCc1ccccc1,6.937100,0.0500
7709,83805-11-2,C=C1/C(=C\C=C2/CCCC3(C)C2CCC3C(C)CCCC(O)(C(F)(...,7.099700,0.0417
7710,1746-01-6,Clc1cc2c(cc1Cl)Oc1cc(Cl)c(Cl)cc1O2,7.206800,0.0199


 Convert a SMILES string to canonical SMILES

In [5]:
df_ws1 = deepcopy(df_ws)
df_ws1["SMILES"] = df_ws1.apply(lambda x: convert_smi_to_canon_smi(x.SMILES), axis=1)
df_ws1

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg"
0,626-48-2,Cc1cc(=O)[nH]c(=O)[nH]1,0.291207,64463.0000
1,27849-94-1,CC(CCc1ccc2c(c1)OCO2)NN,0.440660,75449.0000
2,110-54-3,CCCCCC,0.537460,24980.0000
3,1330-92-3,CCCCC(CC)COC(=O)C1=C(C(=O)OCC(CC)CCCC)CCCC1,0.539250,113917.0000
4,57-55-6,CC(O)CO,0.580330,19989.0000
...,...,...,...,...
7707,56073-10-0,O=c1oc2ccccc2c(O)c1C1CC(c2ccc(-c3ccc(Br)cc3)cc...,6.514700,0.1596
7708,130209-82-4,CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCc1ccccc1,6.937100,0.0500
7709,83805-11-2,C=C1C(=CC=C2CCCC3(C)C2CCC3C(C)CCCC(O)(C(F)(F)F...,7.099700,0.0417
7710,1746-01-6,Clc1cc2c(cc1Cl)Oc1cc(Cl)c(Cl)cc1O2,7.206800,0.0199


In [6]:
print('Original data: ', len(df_ws), 'molecules')
print('Failed data: ', len(df_ws1[df_ws1['SMILES']=='wrong_smiles']), 'molecules')

Original data:  7712 molecules
Failed data:  0 molecules


In [7]:
index=df_ws1.index[df_ws1['SMILES']=='wrong_smiles'].tolist()
wrong_smiles=df_ws.iloc[index]
wrong_smiles=wrong_smiles.SMILES
number=[x+1 for x in index]
bad_molecules = pd.DataFrame({'No. failed smiles in original set': number, 'SMILES of wrong structure: ': wrong_smiles}, index=None)
bad_molecules = bad_molecules.set_index('No. failed smiles in original set')
bad_molecules

Unnamed: 0_level_0,SMILES of wrong structure:
No. failed smiles in original set,Unnamed: 1_level_1


##  Standardization  for work set

In [8]:
df_ws1["Molecule"] = df_ws1.apply(lambda x: standart(x.SMILES), axis=1)
moldf_ws=df_ws1[df_ws1['SMILES']!='wrong_smiles']
print('Kept data: ', len(moldf_ws), 'molecules')

Kept data:  7712 molecules


In [9]:
moldf_ws

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg",Molecule
0,626-48-2,Cc1cc(=O)[nH]c(=O)[nH]1,0.291207,64463.0000,<rdkit.Chem.rdchem.Mol object at 0x0000018B315...
1,27849-94-1,CC(CCc1ccc2c(c1)OCO2)NN,0.440660,75449.0000,<rdkit.Chem.rdchem.Mol object at 0x0000018B315...
2,110-54-3,CCCCCC,0.537460,24980.0000,<rdkit.Chem.rdchem.Mol object at 0x0000018B315...
3,1330-92-3,CCCCC(CC)COC(=O)C1=C(C(=O)OCC(CC)CCCC)CCCC1,0.539250,113917.0000,<rdkit.Chem.rdchem.Mol object at 0x0000018B315...
4,57-55-6,CC(O)CO,0.580330,19989.0000,<rdkit.Chem.rdchem.Mol object at 0x0000018B317...
...,...,...,...,...,...
7707,56073-10-0,O=c1oc2ccccc2c(O)c1C1CC(c2ccc(-c3ccc(Br)cc3)cc...,6.514700,0.1596,<rdkit.Chem.rdchem.Mol object at 0x0000018B327...
7708,130209-82-4,CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCc1ccccc1,6.937100,0.0500,<rdkit.Chem.rdchem.Mol object at 0x0000018B327...
7709,83805-11-2,C=C1C(=CC=C2CCCC3(C)C2CCC3C(C)CCCC(O)(C(F)(F)F...,7.099700,0.0417,<rdkit.Chem.rdchem.Mol object at 0x0000018B327...
7710,1746-01-6,Clc1cc2c(cc1Cl)Oc1cc(Cl)c(Cl)cc1O2,7.206800,0.0199,<rdkit.Chem.rdchem.Mol object at 0x0000018B327...


In [10]:
y_tr=moldf_ws.pLD50
y_tr

0       0.291207
1       0.440660
2       0.537460
3       0.539250
4       0.580330
          ...   
7707    6.514700
7708    6.937100
7709    7.099700
7710    7.206800
7711    7.602600
Name: pLD50, Length: 7712, dtype: float64

In [11]:
df = pd.DataFrame(moldf_ws, columns=["SMILES"])
df

Unnamed: 0,SMILES
0,Cc1cc(=O)[nH]c(=O)[nH]1
1,CC(CCc1ccc2c(c1)OCO2)NN
2,CCCCCC
3,CCCCC(CC)COC(=O)C1=C(C(=O)OCC(CC)CCCC)CCCC1
4,CC(O)CO
...,...
7707,O=c1oc2ccccc2c(O)c1C1CC(c2ccc(-c3ccc(Br)cc3)cc...
7708,CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCc1ccccc1
7709,C=C1C(=CC=C2CCCC3(C)C2CCC3C(C)CCCC(O)(C(F)(F)F...
7710,Clc1cc2c(cc1Cl)Oc1cc(Cl)c(Cl)cc1O2


In [12]:
df.to_csv('datasets/molecule_ws.smi', sep=',', index=False, header=False)

##  Load data and curation test set

In [13]:
df_ts=pd.read_csv('rat_oral_LD50_TS.csv')
df_ts

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg"
0,7782-40-3,C,0.017765,15388.8000
1,2842-38-8,OCCNC1CCCCC1,0.572840,38274.0000
2,66257-53-2,NC(=O)C(=O)O,0.624490,21133.0000
3,2173-56-0,CCCCCOC(=O)CCCC,0.686960,35395.0000
4,4726-93-6,O=C1CCCCC(=O)N1,0.750180,22586.0000
...,...,...,...,...
1924,3385-03-3,CC1(C)OC2CC3C4CC(F)C5=CC(=O)C=CC5(C)C4C(O)CC3(...,5.939000,0.4997
1925,2338-29-6,FC(F)(F)c1nc2c(Cl)c(Cl)c(Cl)c(Cl)c2[nH]1,6.121300,0.2435
1926,128606-48-4,CCOP(=S)(OCC)O/C(C)=C/C(=O)OC,6.282400,0.1399
1927,50585-41-6,Brc1cc2c(cc1Br)Oc1cc(Br)c(Br)cc1O2,6.698800,0.0992


 Convert a SMILES string to canonical SMILES

In [14]:
df_ts1 = deepcopy(df_ts)
df_ts1["SMILES"] = df_ts1.apply(lambda x: convert_smi_to_canon_smi(x.SMILES), axis=1)
df_ts1

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg"
0,7782-40-3,C,0.017765,15388.8000
1,2842-38-8,OCCNC1CCCCC1,0.572840,38274.0000
2,66257-53-2,NC(=O)C(=O)O,0.624490,21133.0000
3,2173-56-0,CCCCCOC(=O)CCCC,0.686960,35395.0000
4,4726-93-6,O=C1CCCCC(=O)N1,0.750180,22586.0000
...,...,...,...,...
1924,3385-03-3,CC1(C)OC2CC3C4CC(F)C5=CC(=O)C=CC5(C)C4C(O)CC3(...,5.939000,0.4997
1925,2338-29-6,FC(F)(F)c1nc2c(Cl)c(Cl)c(Cl)c(Cl)c2[nH]1,6.121300,0.2435
1926,128606-48-4,CCOP(=S)(OCC)OC(C)=CC(=O)OC,6.282400,0.1399
1927,50585-41-6,Brc1cc2c(cc1Br)Oc1cc(Br)c(Br)cc1O2,6.698800,0.0992


In [15]:
print('Original data: ', len(df_ts), 'molecules')
print('Failed data: ', len(df_ts1[df_ts1['SMILES']=='wrong_smiles']), 'molecules')

Original data:  1929 molecules
Failed data:  0 molecules


In [16]:
index=df_ts1.index[df_ts1['SMILES']=='wrong_smiles'].tolist()
wrong_smiles=df_ts.iloc[index]
wrong_smiles=wrong_smiles.SMILES
number=[x+1 for x in index]
bad_molecules = pd.DataFrame({'No. failed smiles in original set': number, 'SMILES of wrong structure: ': wrong_smiles}, index=None)
bad_molecules = bad_molecules.set_index('No. failed smiles in original set')
bad_molecules

Unnamed: 0_level_0,SMILES of wrong structure:
No. failed smiles in original set,Unnamed: 1_level_1


##  Standardization  for test set

In [17]:
df_ts1["Molecule"] = df_ts1.apply(lambda x: standart(x.SMILES), axis=1)
moldf_ts=df_ts1[df_ts1['SMILES']!='wrong_smiles']
print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  1929 molecules


In [18]:
moldf_ts

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg",Molecule
0,7782-40-3,C,0.017765,15388.8000,<rdkit.Chem.rdchem.Mol object at 0x0000018B316...
1,2842-38-8,OCCNC1CCCCC1,0.572840,38274.0000,<rdkit.Chem.rdchem.Mol object at 0x0000018B316...
2,66257-53-2,NC(=O)C(=O)O,0.624490,21133.0000,<rdkit.Chem.rdchem.Mol object at 0x0000018B316...
3,2173-56-0,CCCCCOC(=O)CCCC,0.686960,35395.0000,<rdkit.Chem.rdchem.Mol object at 0x0000018B316...
4,4726-93-6,O=C1CCCCC(=O)N1,0.750180,22586.0000,<rdkit.Chem.rdchem.Mol object at 0x0000018B316...
...,...,...,...,...,...
1924,3385-03-3,CC1(C)OC2CC3C4CC(F)C5=CC(=O)C=CC5(C)C4C(O)CC3(...,5.939000,0.4997,<rdkit.Chem.rdchem.Mol object at 0x0000018B315...
1925,2338-29-6,FC(F)(F)c1nc2c(Cl)c(Cl)c(Cl)c(Cl)c2[nH]1,6.121300,0.2435,<rdkit.Chem.rdchem.Mol object at 0x0000018B315...
1926,128606-48-4,CCOP(=S)(OCC)OC(C)=CC(=O)OC,6.282400,0.1399,<rdkit.Chem.rdchem.Mol object at 0x0000018B315...
1927,50585-41-6,Brc1cc2c(cc1Br)Oc1cc(Br)c(Br)cc1O2,6.698800,0.0992,<rdkit.Chem.rdchem.Mol object at 0x0000018B315...


In [19]:
y_ts=moldf_ts.pLD50
y_ts

0       0.017765
1       0.572840
2       0.624490
3       0.686960
4       0.750180
          ...   
1924    5.939000
1925    6.121300
1926    6.282400
1927    6.698800
1928    9.541100
Name: pLD50, Length: 1929, dtype: float64

In [20]:
df_ts = pd.DataFrame(moldf_ts, columns=["SMILES"])
df_ts

Unnamed: 0,SMILES
0,C
1,OCCNC1CCCCC1
2,NC(=O)C(=O)O
3,CCCCCOC(=O)CCCC
4,O=C1CCCCC(=O)N1
...,...
1924,CC1(C)OC2CC3C4CC(F)C5=CC(=O)C=CC5(C)C4C(O)CC3(...
1925,FC(F)(F)c1nc2c(Cl)c(Cl)c(Cl)c(Cl)c2[nH]1
1926,CCOP(=S)(OCC)OC(C)=CC(=O)OC
1927,Brc1cc2c(cc1Br)Oc1cc(Br)c(Br)cc1O2


In [50]:
df_ts.to_csv('datasets/molecule_ts.smi', sep=',', index=False, header=False)

# Calculation Fingerprints for work set

# 6.Descriptor calculation for work set

In [54]:
fp

{'AtomPairs2DCount': 'fingerprints_xml\\AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'fingerprints_xml\\AtomPairs2DFingerprinter.xml',
 'EState': 'fingerprints_xml\\EStateFingerprinter.xml',
 'CDKextended': 'fingerprints_xml\\ExtendedFingerprinter.xml',
 'CDK': 'fingerprints_xml\\Fingerprinter.xml',
 'CDKgraphonly': 'fingerprints_xml\\GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'fingerprints_xml\\KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'fingerprints_xml\\KlekotaRothFingerprinter.xml',
 'MACCS': 'fingerprints_xml\\MACCSFingerprinter.xml',
 'PubChem': 'fingerprints_xml\\PubchemFingerprinter.xml',
 'SubstructureCount': 'fingerprints_xml\\SubstructureFingerprintCount.xml',
 'Substructure': 'fingerprints_xml\\SubstructureFingerprinter.xml'}

## PubChem FPs

In [55]:
from padelpy import padeldescriptor

fingerprint = 'PubChem'

fingerprint_output_file = ''.join([fingerprint,'.csv']) #Substructure.csv
fingerprint_descriptortypes = fp[fingerprint]

padeldescriptor(mol_dir='datasets/molecule_ws.smi', 
                d_file=fingerprint_output_file, #'Substructure.csv'
                #descriptortypes='SubstructureFingerprint.xml', 
                descriptortypes= fingerprint_descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=2,
                removesalt=True,
                log=True,
                fingerprints=True)

In [21]:
descriptors_PubChem = pd.read_csv('PubChem.csv')

In [22]:
descriptors_PubChem

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,AUTOGEN_molecule_ws_1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AUTOGEN_molecule_ws_2,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AUTOGEN_molecule_ws_3,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AUTOGEN_molecule_ws_4,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AUTOGEN_molecule_ws_5,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7707,AUTOGEN_molecule_ws_7708,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7708,AUTOGEN_molecule_ws_7709,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7709,AUTOGEN_molecule_ws_7710,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7710,AUTOGEN_molecule_ws_7711,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
x_tr = descriptors_PubChem.drop('Name', axis=1)

In [24]:
x_tr = np.array(x_tr, dtype=np.float32)
y_tr = np.array(y_tr, dtype=np.float32)

In [62]:
savetxt('models/Padels/x_tr_PubChem.csv', x_tr, delimiter=',')

In [25]:
x_tr.shape

(7712, 881)

# 7.Descriptor calculation for test set

In [64]:
from padelpy import padeldescriptor

fingerprint = 'PubChem'

fingerprint_output_file = ''.join([fingerprint,'_ts.csv']) #PubChem.csv
fingerprint_descriptortypes = fp[fingerprint]

padeldescriptor(mol_dir='datasets/molecule_ts.smi', 
                d_file=fingerprint_output_file, #'PubChem.csv'
                #descriptortypes='PubChem.xml', 
                descriptortypes= fingerprint_descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=2,
                removesalt=True,
                log=False,
                fingerprints=True)

In [26]:
descriptors_PubChem_ts = pd.read_csv('PubChem_ts.csv')

In [27]:
x_ts = descriptors_PubChem_ts.drop('Name', axis=1)

In [28]:
x_ts.shape

(1929, 881)

# load the models from disk

In [29]:
best_svm = pickle.load(open('models/Padels/Toxicity_SVM_PubChem.pkl', 'rb'))

In [30]:
best_gbr = pickle.load(open('Models/Padels/Toxicitu_GBR_PubChem.pkl', 'rb'))

In [31]:
best_MLPR = pickle.load(open('Models/Padels/Toxicity_MLPR_PubChem.pkl', 'rb'))

# Prediction for CV

In [32]:
y_pred_ws_GBR = best_gbr.predict(x_tr)

In [33]:
y_pred_ws_svm = best_svm.predict(x_tr)

In [37]:
y_pred_ws_MLPR = best_MLPR.predict(x_tr)

In [38]:
y_pred_con=(y_pred_ws_GBR+y_pred_ws_svm+y_pred_ws_MLPR)/3

In [39]:
R2_WS = round(r2_score(y_tr, y_pred_con), 2)
R2_WS

0.94

In [40]:
RMSE_WS=round(np.sqrt(mean_absolute_error(y_tr, y_pred_con)), 2)
RMSE_WS

0.41

In [41]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [42]:
y_pred_CV_svm = cross_val_predict(best_svm, x_tr, y_tr, cv=cv)

In [43]:
y_pred_CV_gbr = cross_val_predict(best_gbr, x_tr, y_tr, cv=cv)

In [44]:
y_pred_CV_MLPR = cross_val_predict(best_MLPR, x_tr, y_tr, cv=cv)

# For all models

# three models: svm+gbr+MLPR

In [45]:
y_pred_con=(y_pred_CV_svm+y_pred_CV_gbr+y_pred_CV_MLPR)/3

In [46]:
Q2_CV = round(r2_score(y_tr, y_pred_con), 2)
Q2_CV

0.59

In [47]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_con)),2)
RMSE_CV

0.64

# two models: svm+gbr

In [48]:
y_pred_con_without_MLPR=(y_pred_CV_svm+y_pred_CV_gbr)/2

In [49]:
Q2_CV = round(r2_score(y_tr, y_pred_con_without_MLPR), 2)
Q2_CV

0.59

In [50]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_con_without_MLPR)),2)
RMSE_CV

0.64

# Prediction for test set's molecules

In [51]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [52]:
y_pred_svm = best_svm.predict(x_ts)

In [53]:
y_pred_gbr = best_gbr.predict(x_ts)

In [54]:
y_pred_MLPR = best_MLPR.predict(x_ts)

In [55]:
y_pred_con=(y_pred_svm+y_pred_gbr+y_pred_MLPR)/3

In [56]:
Q2_TS = round(r2_score(y_ts, y_pred_con), 2)
Q2_TS

0.61

In [58]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts, y_pred_con)), 2)
RMSE_TS

0.62

# Estimating applicability domain. Method - Euclidian distances, K=1

In [59]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [60]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7702,7703,7704,7705,7706,7707,7708,7709,7710,7711
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2.449490,4.472136,1.000000,3.000000,1.414214,2.645751,3.000000,3.741657,0.000000,1.000000,...,1.414214,1.732051,2.449490,4.582576,2.236068,3.316625,4.898980,3.000000,2.828427,3.316625
2,3.162278,4.582576,1.732051,3.872983,1.414214,3.000000,3.000000,4.123106,0.000000,2.236068,...,1.414214,2.000000,3.316625,5.196152,4.472136,3.605551,4.898980,3.162278,3.162278,3.872983
3,3.316625,4.690416,2.000000,4.123106,2.000000,3.162278,3.464102,4.242640,0.000000,2.236068,...,2.449490,3.000000,5.744563,5.385165,4.582576,4.358899,4.898980,3.605551,3.162278,4.795832
4,3.605551,4.795832,2.000000,4.242640,2.000000,3.316625,3.605551,4.242640,1.000000,2.236068,...,2.828427,3.000000,6.403124,5.477226,4.898980,5.000000,5.000000,5.196152,3.464102,5.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7707,14.798649,13.892444,15.165751,14.282857,14.933185,14.594520,14.798649,14.525839,14.456832,15.652476,...,14.832397,15.066519,14.594520,14.966630,14.832397,14.966630,13.892444,14.525839,14.594520,14.282857
7708,14.832397,14.106736,15.362291,14.491377,15.000000,14.730920,15.000000,14.832397,15.033297,15.842979,...,14.899665,15.165751,14.628738,15.099669,14.866069,15.099669,13.928389,14.832397,14.628738,14.282857
7709,15.427249,14.177447,15.779734,14.525839,15.684387,15.099669,15.033297,14.866069,15.132746,16.000000,...,15.459625,15.874508,14.662878,15.362291,15.297058,15.198684,13.964240,14.933185,15.264338,14.317822
7710,15.524175,14.456832,16.062378,14.933185,15.716233,15.198684,15.198684,15.264338,15.556349,16.522711,...,15.842979,15.968719,14.662878,15.394804,15.716233,15.198684,14.106736,15.099669,15.652476,14.594520


In [61]:
similarity= neighbors_k

In [62]:
Dmean=np.mean(similarity[1,:])

In [63]:
round(Dmean, 2)

2.83

In [64]:
std=np.std(similarity[1,:])

In [65]:
round(std, 2)

1.58

In [66]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

3.62


In [67]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [68]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1919,1920,1921,1922,1923,1924,1925,1926,1927,1928
0,1.732051,2.645751,3.464102,1.000000,2.645751,1.732051,1.000000,1.732051,2.236068,3.464102,...,3.605551,4.242640,1.414214,1.732051,1.000000,1.000000,1.732051,2.000000,5.385165,3.605551
1,2.000000,3.316625,3.464102,1.414214,3.741657,1.732051,1.414214,2.000000,2.236068,3.605551,...,4.358899,5.291502,3.316625,3.162278,1.414214,2.449490,2.828427,2.449490,5.744563,3.741657
2,2.000000,3.316625,4.000000,1.732051,3.741657,2.000000,1.414214,2.236068,2.449490,4.123106,...,4.358899,5.291502,3.464102,3.316625,1.732051,2.645751,3.000000,3.316625,5.916080,4.242640
3,2.236068,3.464102,4.000000,1.732051,4.000000,2.236068,1.732051,2.449490,2.449490,4.242640,...,4.690416,6.244998,4.000000,3.464102,2.000000,3.464102,4.123106,3.316625,5.916080,4.358899
4,2.236068,3.605551,4.123106,2.000000,4.242640,2.236068,2.828427,2.645751,2.449490,4.358899,...,6.708204,6.324555,4.242640,3.464102,2.236068,4.123106,4.242640,3.464102,5.916080,4.472136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7707,15.394804,14.387495,14.933185,14.594520,14.730920,14.696939,14.106736,15.198684,14.933185,13.892444,...,15.132746,13.892444,15.165751,14.832397,15.491934,15.132746,15.198684,14.594520,14.525839,14.933185
7708,15.588457,14.456832,14.933185,14.933185,14.764823,14.798649,14.628738,15.394804,15.066519,14.071247,...,15.198684,14.000000,15.198684,14.899665,15.524175,15.198684,15.231546,14.966630,14.560220,15.000000
7709,16.000000,15.033297,15.362291,15.362291,15.132746,15.524175,14.798649,15.748015,15.748015,14.071247,...,15.198684,14.212670,15.198684,15.231546,16.062378,15.329710,15.231546,15.264338,15.198684,15.099669
7710,16.278820,15.132746,15.716233,15.588457,15.491934,15.620500,15.099669,16.031219,15.779734,14.491377,...,15.231546,14.456832,15.297058,15.394804,16.401220,15.329710,15.329710,15.556349,15.588457,15.264338


In [69]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[1.732 2.646 3.464 ... 2.    5.385 3.606]


In [70]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True  True ...  True False  True]


In [71]:
print("Coverage = ", sum(cpd_AD) / len(cpd_AD))

Coverage =  0.751684810782789


In [72]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [   0    1    2 ... 1925 1926 1928]


In [73]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# Prediction only for molecules included in  AD

In [74]:
y_pred_con_ad=list(y_pred_con)

In [75]:
y_pred_con_ad[:] = [x for i,x in enumerate(y_pred_con_ad) if i not in out_Ad]

In [76]:
len(y_pred_con_ad)

1450

In [77]:
y_ts_ad=list(y_ts)

In [78]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [79]:
len(y_ts_ad)

1450

In [80]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_con_ad), 2)
Q2_TS

0.66

In [81]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts_ad, y_pred_con_ad)), 2)
RMSE_TS

0.61