In [2]:
from warnings import filterwarnings
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import random
import time as time
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier


from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect

# scipy hierarchy clustering
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster, cophenet
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist

# SKlearn metrics
from sklearn.metrics import silhouette_samples, silhouette_score, adjusted_rand_score, adjusted_mutual_info_score

#### The Ph-fp of a compound is a vector containing compound's biological activities across the panel of assays predicted by corresponding ML classification models.
The Ph-fp is composed of 8 bioactivities predicted for the set of 174 NPS compounds and 1740 decoys random sampled from DUD-E dataset. 

In [122]:
def transform_fp(df):
    fp_list = []
    df_tmp = df.copy
    #df = df.astype(int)
    #df = df.astype(str)
    
    for x in df.columns:
        if x == "model1":
            fp_list = df[x]
        else:
            fp_list =  fp_list + df[x]
    
    return fp_list

##### ML Methods:
* SVM (SVC) 
* MLP
* RF
* KNN
* Bernoulli NB

##### hyperparameters
* The  determined using 10-fold cross-validated. 

##### Fingerprints used
* Morgan, or
* MACCS

In [3]:
# Set path
path = '/Users/hek/Research/Cheminformatics/Project_1_NPS/Stimulant vs. Hallucinogens/Dataset/Bioassay data/ChEMBL data/'
output_path = '/Users/hek/Research/Cheminformatics/Project_1_NPS/Stimulant vs. Hallucinogens/ChEMBL Dataset ML results/'
df_model_list = pd.read_csv(path+"Target - Assays - Model list and results.csv")

In [4]:
df_model_list.head(1)

Unnamed: 0,Target,Receptor family,Organism,UniProtKB,ChEMBL,Target_type,Activity_type,Nunique,N_actives_p5,N_actives_p6,...,RF_p6_n_morganfp,RF_p6_MCC_morganfp,RF_p7_n_morganfp,RF_p7_MCC_morganfp,RF_p5_n_maccsfp,RF_p5_MCC_maccsfp,RF_p6_n_maccsfp,RF_p6_MCC_maccsfp,RF_p7_n_maccsfp,RF_p7_MCC_maccsfp
0,CB1,cannabinoid,Homo sapiens,P21554,CHEMBL218,SINGLE PROTEIN,Ki,2710,2670,2025,...,180.0,0.935636,100.0,0.939861,180.0,0.933666,140.0,0.938051,60.0,0.929244


In [5]:
df_model_list["UniProtKB"].nunique()

70

In [None]:
target_count = 0
for i in range(df_model_list.shape[0]):
    if df_model_list.loc[i,'RF_p7_MCC_maccsfp'] >= 0.90:
        target_count +=1
print(target_count)

In [164]:
def load_ChEMBL_data(filename):
    df = pd.read_csv(path+filename,dtype=object)
    #print("Shape of dataframe: ", df.shape)
    return df

def get_training_X_Y(df,method="maccsfp"):
    Y = df.active
    total = []
    for i in range(df.shape[0]):
        b = [int(i) for i in list(df.loc[i,method])]
        total.append(b)
    X_df = pd.DataFrame(total)
    
    return X_df, Y

def get_NPS_X(df,method="maccsfp"):
    total = []
    for i in range(df.shape[0]):
        b = [int(i) for i in list(df.loc[i,method])]
        total.append(b)
    X_df = pd.DataFrame(total)
    
    return X_df

##### Load NPS set and calculate fingerprint 
* MACCS: "maccs"
* Morgan: "morgan

In [174]:
method = "maccsfp"

189 NPS compounds

In [175]:
NPS = pd.read_csv(output_path+"Drugs Raman or SERS in literature _ Paper 1.csv", dtype=object)
NPS_X= get_NPS_X(NPS,method="maccsfp")
print(NPS_X.shape)

(189, 166)


756 random sampled DUD-E decoys

In [167]:
NPS_decoy = pd.read_csv(output_path+"Decoys Drugs Raman or SERS in literature _ Paper 1.csv", dtype=object)
NPS_decoy_X = get_NPS_X(NPS_decoy,method="maccsfp")
print(NPS_decoy_X.shape)

(756, 166)


##### Check how many models are kept and confirm length of Ph-fp

In [188]:
model_count = 0
for i in range(df_model_list.shape[0]):
    if df_model_list.loc[i,'RF_p7_MCC_maccsfp'] >= 0.90:
        model_count +=1
print(model_count)

102


#### Load Training ChEMBL dataset
method = fingerprint used

In [178]:
method

'maccsfp'

In [189]:
tinit = time.time()
model_count = 0
NPS_pf_tmp = pd.DataFrame()
NPS_decoy_pf_tmp = pd.DataFrame()
for i in range(df_model_list.shape[0]):
#for i in range(0,20):
    target = df_model_list.loc[i,"Target"]
    UniProtKB = df_model_list.loc[i,'UniProtKB']
    Act_type = df_model_list.loc[i,'Activity_type'] 
    #print(target+'_'+UniProtKB+'_'+Act_type)
    if df_model_list.loc[i,'RF_p7_MCC_maccsfp'] >= 0.90:
        model_count +=1
        print("Training model",model_count)
        
        #load training dataset
        filename = "Final dataset "+"Actives >= 7.0 "+target+'_'+UniProtKB+'_'+Act_type+'.csv'
        df = load_ChEMBL_data(filename)
        print(filename)
        #Transform dataset
        X_df, Y = get_training_X_Y(df,method='maccsfp')
        #print(X_df.shape[0])
        
        #RF as base estimator
        RF =  RandomForestClassifier(n_estimators=int(df_model_list.loc[i,'RF_p7_n_maccsfp']),
                                     max_features=None,
                                     criterion = 'gini',random_state=13)
        
        # Bagging with RF as base estimator
        Bagging_model = BaggingClassifier(RF, n_estimators = 10, max_samples = 0.90,random_state = 42)
        print(Bagging_model)
        clf = Bagging_model.fit(X_df, Y)
        NPS_Y_pred = clf.predict(NPS_X)
        NPS_pf_tmp["model"+str(model_count)]=pd.to_numeric(NPS_Y_pred).astype(int).astype(str)
    
        NPS_decoy_Y_pred = clf.predict(NPS_decoy_X)
        NPS_decoy_pf_tmp["model"+str(model_count)]=pd.to_numeric(NPS_decoy_Y_pred).astype(int).astype(str)       
        
    else:
        print("skiped this model due to MCC < 0.90")
        continue

tfinal = time.time()
print("Total of models included in constructing the Ph-fp is ",model_count, "used", tfinal-tinit, 'seconds')

NPS_result = transform_fp(NPS_pf_tmp)
NPS_decoy_result = transform_fp(NPS_decoy_pf_tmp)

Training model 1
Final dataset Actives >= 7.0 CB1_P21554_Ki.csv
BaggingClassifier(base_estimator=RandomForestClassifier(max_features=None,
                                                        n_estimators=60,
                                                        random_state=13),
                  max_samples=0.9, random_state=42)
Training model 2
Final dataset Actives >= 7.0 CB1_P21554_EC50.csv
BaggingClassifier(base_estimator=RandomForestClassifier(max_features=None,
                                                        n_estimators=180,
                                                        random_state=13),
                  max_samples=0.9, random_state=42)
Training model 3
Final dataset Actives >= 7.0 CB1_P21554_IC50.csv
BaggingClassifier(base_estimator=RandomForestClassifier(max_features=None,
                                                        n_estimators=140,
                                                        random_state=13),
                  max_samples=0.

Training model 27
Final dataset Actives >= 7.0 5HT3A_P46098_IC50.csv
BaggingClassifier(base_estimator=RandomForestClassifier(max_features=None,
                                                        n_estimators=20,
                                                        random_state=13),
                  max_samples=0.9, random_state=42)
Training model 28
Final dataset Actives >= 7.0 5HT3A_P35563_Ki.csv
BaggingClassifier(base_estimator=RandomForestClassifier(max_features=None,
                                                        n_estimators=140,
                                                        random_state=13),
                  max_samples=0.9, random_state=42)
Training model 29
Final dataset Actives >= 7.0 5HT5A_P47898_Ki.csv
BaggingClassifier(base_estimator=RandomForestClassifier(max_features=None,
                                                        random_state=13),
                  max_samples=0.9, random_state=42)
Training model 30
Final dataset Actives >= 7.0 

Training model 52
Final dataset Actives >= 7.0 D4_P30729_Ki.csv
BaggingClassifier(base_estimator=RandomForestClassifier(max_features=None,
                                                        n_estimators=20,
                                                        random_state=13),
                  max_samples=0.9, random_state=42)
Training model 53
Final dataset Actives >= 7.0 D5_P21918_Ki.csv
BaggingClassifier(base_estimator=RandomForestClassifier(max_features=None,
                                                        n_estimators=20,
                                                        random_state=13),
                  max_samples=0.9, random_state=42)
Training model 54
Final dataset Actives >= 7.0 DAT_Q01959_Ki.csv
BaggingClassifier(base_estimator=RandomForestClassifier(max_features=None,
                                                        n_estimators=140,
                                                        random_state=13),
                  max_samples=0.9, r

Training model 76
Final dataset Actives >= 7.0 kappa_P41145_Ki.csv
BaggingClassifier(base_estimator=RandomForestClassifier(max_features=None,
                                                        random_state=13),
                  max_samples=0.9, random_state=42)
Training model 77
Final dataset Actives >= 7.0 kappa_P41145_IC50.csv
BaggingClassifier(base_estimator=RandomForestClassifier(max_features=None,
                                                        n_estimators=140,
                                                        random_state=13),
                  max_samples=0.9, random_state=42)
Training model 78
Final dataset Actives >= 7.0 kappa_P41145_EC50.csv
BaggingClassifier(base_estimator=RandomForestClassifier(max_features=None,
                                                        random_state=13),
                  max_samples=0.9, random_state=42)
skiped this model due to MCC < 0.90
Training model 79
Final dataset Actives >= 7.0 kappa_P33534_Ki.csv
BaggingClassifi

Training model 101
Final dataset Actives >= 7.0 GABAalpha2_P47869_Ki.csv
BaggingClassifier(base_estimator=RandomForestClassifier(max_features=None,
                                                        random_state=13),
                  max_samples=0.9, random_state=42)


  NPS_pf_tmp["model"+str(model_count)]=pd.to_numeric(NPS_Y_pred).astype(int).astype(str)
  NPS_decoy_pf_tmp["model"+str(model_count)]=pd.to_numeric(NPS_decoy_Y_pred).astype(int).astype(str)


Training model 102
Final dataset Actives >= 7.0 GABAalpha5_P31644_Ki.csv
BaggingClassifier(base_estimator=RandomForestClassifier(max_features=None,
                                                        n_estimators=60,
                                                        random_state=13),
                  max_samples=0.9, random_state=42)
Total of models included in constructing the Ph-fp is  102 used 788.2086019515991 seconds


  NPS_pf_tmp["model"+str(model_count)]=pd.to_numeric(NPS_Y_pred).astype(int).astype(str)
  NPS_decoy_pf_tmp["model"+str(model_count)]=pd.to_numeric(NPS_decoy_Y_pred).astype(int).astype(str)


In [184]:
NPS['RF_p6_maccs'] = NPS_result
NPS_decoy['RF_p6_maccs'] = NPS_decoy_result

In [185]:
NPS.head(2)

Unnamed: 0,Name,Other name,Formula,MW,CAS,PubChem CID,RotBondCount,Conformers,Canonical SMILES,Pharm class,...,canonical_smiles,maccsfp,morganfp,RF_morgan,MCS_K,RF_p5_morgan,RF_p6_morgan,RF_p7_morgan,RF_p5_maccs,RF_p6_maccs
0,Heroin,,C21H23NO5,369.4,561-27-3,5462328,4,9.0,CC(=O)OC1C=CC2C3CC4=C5C2(C1OC5=C(C=C4)OC(=O)C)...,Depressants_Opioids,...,CC(=O)Oc1ccc2c3c1OC1C(OC(C)=O)C=CC4C(C2)N(C)CC...,0000000000000000000000000000000000000000000000...,0000000000010001000000000000000001001000000000...,0000000000000000000000000000000000000000000000...,3,0000000000000000100000000000000000000001000110...,0000000000000000001000000000000000000001001110...,0000000000000100010000000000000000001000000000...,0000000000001011010101110000100010010101110111...,0000000000001010001010000000100000010001110111...
1,Morphine,,C17H19NO3,285.34,57-27-2,5288826,0,1.0,CN1CCC23C4C1CC5=C2C(=C(C=C5)O)OC3C(C=C4)O,Depressants_Opioids,...,CN1CCC23c4c5ccc(O)c4OC2C(O)C=CC3C1C5,0000000000000000000000000000000000000000000000...,0000000000000001000000000000000001001000000000...,0000000000000000000000000000000000000000000000...,3,0000000000001010100001000000000000001101111111...,0000000000100100001001000010000000000001111111...,0000000000000100000000000000000000001111001110...,1100000010011111110101111110110010010111111111...,0000000000000111001110110100110010010111111111...


In [186]:
NPS.to_csv(output_path+"Drugs Raman or SERS in literature _ Paper 1.csv",index = False)
NPS_decoy.to_csv(output_path+"Decoys Drugs Raman or SERS in literature _ Paper 1.csv",index = False)

##### BNB as baseline prediction

In [13]:
BNB = BernoulliNB(alpha= 0.01, fit_prior= True)
Bagging_model = BaggingClassifier(base_estimator = BNB, n_estimators = 50, max_samples = 0.90,random_state = 42) 

NPS_pf_tmp = pd.DataFrame()
NPS_decoy_pf_tmp = pd.DataFrame()
tinit = time.time()
for i in range(len(targets)):
        print("target:", targets[i])
        X = target_X[i]
        Y = target_Y[i]
        #print(X.shape)
        clf = Bagging_model.fit(X, Y)
        NPS_Y_pred = clf.predict(NPS_X)
        NPS_pf_tmp[targets[i]] = NPS_Y_pred
        #print(len(NPS_Y_pred))
        NPS_decoy_Y_pred = clf.predict(NPS_decoy_X)
        NPS_decoy_pf_tmp[targets[i]] = NPS_decoy_Y_pred
        #print(len(NPS_decoy_Y_pred))
        
tfinal = time.time()
print(tfinal-tinit, 'seconds')
# Transform prediction for all targets as binatry vector - Phfp
NPS['Phfp_BNB_'+method] = transform_fp(NPS_pf_tmp)
NPS_decoy['Phfp_BNB_'+method] = transform_fp(NPS_decoy_pf_tmp)

target: CB1
target: CB2
target: HT2A
target: HT2C
target: mu
target: NET
target: DAT
target: SERT
45.052056074142456 seconds


#### Phfp prediction using other algorithms, where hyperparameters are different for each target

In [14]:
# MACCS model hyperparameters
SVM_MACCS = (SVC(C=16,gamma=0.0625, random_state=13),
      SVC(C=4,gamma=0.0625, random_state=13),
      SVC(C=4,gamma=0.125, random_state=13),
      SVC(C=4,gamma=0.0625, random_state=13),
      SVC(C=4,gamma=0.0625, random_state=13),
      SVC(C=4,gamma=0.0625, random_state=13),
      SVC(C=16,gamma=0.03125, random_state=13),
      SVC(C=4,gamma=0.0625, random_state=13))
RF_MACCS = (RandomForestClassifier(n_estimators=1460, criterion = 'entropy',random_state=13),
     RandomForestClassifier(n_estimators=910, criterion = 'entropy',random_state=13),
     RandomForestClassifier(n_estimators=960, random_state=13),
     RandomForestClassifier(n_estimators=160, random_state=13),
     RandomForestClassifier(n_estimators=410, random_state=13),
     RandomForestClassifier(n_estimators=360, criterion = 'entropy',random_state=13),
     RandomForestClassifier(n_estimators=360, criterion = 'entropy',random_state=13),
     RandomForestClassifier(n_estimators=410, criterion = 'entropy',random_state=13))
KNN_MACCS = (KNeighborsClassifier(n_neighbors = 1),
       KNeighborsClassifier(n_neighbors = 5),
       KNeighborsClassifier(n_neighbors = 5),
       KNeighborsClassifier(n_neighbors = 7),
       KNeighborsClassifier(n_neighbors = 5),
       KNeighborsClassifier(n_neighbors = 5),
       KNeighborsClassifier(n_neighbors = 5),
       KNeighborsClassifier(n_neighbors = 5))
MLP_MACCS = (MLPClassifier(hidden_layer_sizes = (100,),activation='tanh',alpha=0.001, max_iter = 400),
       MLPClassifier(hidden_layer_sizes = (100,),activation='tanh',alpha=0.01, max_iter = 400),
      MLPClassifier(hidden_layer_sizes = (50,),activation='tanh',alpha=0.001, max_iter = 400),
      MLPClassifier(hidden_layer_sizes = (100,),activation='tanh',alpha=0.001, max_iter = 400),
      MLPClassifier(hidden_layer_sizes = (30,),activation='tanh',alpha=0.01, max_iter = 400),
      MLPClassifier(hidden_layer_sizes = (50,),activation='tanh',alpha=0.001, max_iter = 400),
      MLPClassifier(hidden_layer_sizes = (50,),activation='tanh',alpha=0.001, max_iter = 400),
      MLPClassifier(hidden_layer_sizes = (100,),activation='tanh',alpha=0.001, max_iter = 400))

In [15]:
def Phfp_prediction(models_target):
    NPS_pf_tmp = pd.DataFrame()
    NPS_decoy_pf_tmp = pd.DataFrame()
    tinit = time.time()
    for i in range(len(targets)):
        print("target:", targets[i])
        X = target_X[i]
        Y = target_Y[i]
        #print(X.shape)
        base_estimator = models_target[i]
        Bagging_model = BaggingClassifier(base_estimator = base_estimator, n_estimators = 50, max_samples = 0.90,random_state = 42)
        print(Bagging_model)
        clf = Bagging_model.fit(X, Y)
        
        NPS_Y_pred = clf.predict(NPS_X)
        NPS_pf_tmp[targets[i]] = NPS_Y_pred
        #print(len(NPS_Y_pred))
        NPS_decoy_Y_pred = clf.predict(NPS_decoy_X)
        NPS_decoy_pf_tmp[targets[i]] = NPS_decoy_Y_pred
        #print(len(NPS_decoy_Y_pred))
    
    tfinal = time.time()
    print(tfinal-tinit, 'seconds')
    # Transform prediction for all targets as binatry vector - Phfp
    NPS_pf = transform_fp(NPS_pf_tmp)
    NPS_decoy_pf = transform_fp(NPS_decoy_pf_tmp)
    print("Done Phfp prediciton with: "+str(models_target))
    return NPS_pf, NPS_decoy_pf

In [29]:
NPS['Phfp_SVM_'+method], NPS_decoy['Phfp_SVM_'+method] = Phfp_prediction(SVM_MACCS)
NPS['Phfp_RF_'+method], NPS_decoy['Phfp_RF_'+method] = Phfp_prediction(RF_MACCS)
NPS['Phfp_KNN_'+method], NPS_decoy['Phfp_KNN_'+method] = Phfp_prediction(KNN_MACCS)
NPS['Phfp_MLP_'+method], NPS_decoy['Phfp_MLP_'+method] = Phfp_prediction(MLP_MACCS)

target: CB1
BaggingClassifier(base_estimator=SVC(C=16, gamma=0.0625, random_state=13),
                  max_samples=0.9, n_estimators=50, random_state=42)


KeyboardInterrupt: 

In [19]:
NPS.tail(10)

Unnamed: 0,Name,Other name,Formula,MW,CAS,PubChem CID,RotBondCount,Conformers,Canonical SMILES,Pharm class,...,Canonical smiles,MCS_K,maccsfp,morganfp,fp,Phfp_BNB_maccs,Phfp_SVM_maccs,Phfp_RF_maccs,Phfp_KNN_maccs,Phfp_MLP_maccs
164,MiPT,N-(2-(1H-Indol-3-yl)ethyl)-N-methylpropan-2-amine,C14H20N2,216.32,96096-52-5,29935323,4,10,CC(C)N(C)CCC1=CNC2=CC=CC=C21,Serotonergic psychedelics,...,CC(C)N(C)CCc1c[nH]c2ccccc12,17,0000000000000000000000000000000000000000000000...,0100000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11111111,110111,110111,1111111,110111
165,4-HO-DiPT,3-[2-[di(propan-2-yl)amino]ethyl]-1H-indol-4-ol,C16H24N2O,260.37,132328-45-1,21854225,5,9,CC(C)N(CCC1=CNC2=C1C(=CC=C2)O)C(C)C,Serotonergic psychedelics,...,CC(C)N(CCc1c[nH]c2cccc(O)c12)C(C)C,17,0000000000000000000000000000000000000000000000...,0100000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11011111,110010,110000,1110001,110000
166,4-HO-MET,3-[2-[ethyl(methyl)amino]ethyl]-1H-indol-4-ol,C13H18N2O,218.29,77872-41-4,21786582,4,9,CCN(C)CCC1=CNC2=C1C(=CC=C2)O,Serotonergic psychedelics,...,CCN(C)CCc1c[nH]c2cccc(O)c12,17,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11111111,110111,110101,1111111,1110111
167,4-HO-DET,3-(2-(Diethylamino)ethyl)-1H-indol-4-ol,C14H20N2O,232.32,22204-89-3,9991554,5,10,CCN(CC)CCC1=CNC2=C1C(=CC=C2)O,Serotonergic psychedelics,...,CCN(CC)CCc1c[nH]c2cccc(O)c12,17,0000000000000000000000000000000000000000000000...,0000000000000000000000000010000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11111111,110111,110001,1111101,110111
168,4-HO-MiPT,4-Hydroxy-N-methyl-N-isopropyltryptamine,C14H20N2O,232.32,77872-43-6,10082683,4,10,CC(C)N(C)CCC1=CNC2=C1C(=CC=C2)O,Serotonergic psychedelics,...,CC(C)N(C)CCc1c[nH]c2cccc(O)c12,17,0000000000000000000000000000000000000000000000...,0100000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11111111,110111,110101,1111111,110111
169,5-MeO-AMT,1-(5-methoxy-1H-indol-3-yl)propan-2-amine,C12H16N2O,204.27,1137-04-8,36906,3,7,CC(CC1=CNC2=C1C=C(C=C2)OC)N,Serotonergic psychedelics,...,COc1ccc2[nH]cc(CC(C)N)c2c1,15,0000000000000000000000000000000000000000000000...,0100000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",10111,110111,110000,110110,110111
170,5-MeO-MiPT,N-[2-(5-methoxy-1H-indol-3-yl)ethyl]-N-methylp...,C15H22N2O,246.35,96096-55-8,2763156,5,7,CC(C)N(C)CCC1=CNC2=C1C=C(C=C2)OC,Serotonergic psychedelics,...,COc1ccc2[nH]cc(CCN(C)C(C)C)c2c1,17,0000000000000000000000000000000000000000000000...,0100000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11111111,111111,110111,1111111,110111
171,5-MeO-DMT,"5-Methoxy-N,N-dimethyltryptamine",C13H18N2O,218.29,1019-45-0,1832,4,10,CN(C)CCC1=CNC2=C1C=C(C=C2)OC,Serotonergic psychedelics,...,COc1ccc2[nH]cc(CCN(C)C)c2c1,15,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11111111,110111,110111,1111111,110111
172,5-MeO-DiPT,"5-Methoxy-N,N-diisopropyltryptamine",C17H26N2O,274.4,4021-34-5,151182,6,4,CC(C)N(CCC1=CNC2=C1C=C(C=C2)OC)C(C)C,Serotonergic psychedelics,...,COc1ccc2[nH]cc(CCN(C(C)C)C(C)C)c2c1,17,0000000000000000000000000000000000000000000000...,0100000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11111111,110111,110101,1111111,110111
173,5-MeO-DET,"5-Methoxy-N,N-diethyltryptamine",C15H22N2O,246.35,1218-40-2,417608,6,10,CCN(CC)CCC1=CNC2=C1C=C(C=C2)OC,Serotonergic psychedelics,...,CCN(CC)CCc1c[nH]c2ccc(OC)cc12,17,0000000000000000000000000000000000000000000000...,0000000000000000000000000010000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11111111,1111111,110111,11111111,1111111


Drop RDKit mol and save results

In [32]:
NPS

Unnamed: 0,Name,Other name,Formula,MW,CAS,PubChem CID,RotBondCount,Conformers,Canonical SMILES,Pharm class,...,Canonical smiles,MCS_K,maccsfp,morganfp,fp,Phfp_BNB_maccs,Phfp_SVM_maccs,Phfp_RF_maccs,Phfp_KNN_maccs,Phfp_MLP_maccs
0,Heroin,,C21H23NO5,369.4,561-27-3,5462328,4,9,CC(=O)OC1C=CC2C3CC4=C5C2(C1OC5=C(C=C4)OC(=O)C)...,Sedatives,...,CC(=O)Oc1ccc2c3c1OC1C(OC(C)=O)C=CC4C(C2)N(C)CC...,3,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11101111,00001000,00001000,01111001,00001000
1,Morphine,,C17H19NO3,285.34,57-27-2,5288826,0,1,CN1CCC23C4C1CC5=C2C(=C(C=C5)O)OC3C(C=C4)O,Sedatives,...,CN1CCC23c4c5ccc(O)c4OC2C(O)C=CC3C1C5,3,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11111111,00111000,00101000,01101000,00111100
2,Acetylcodeine,6-Acetylcodiene,C20H23NO4,341.4,6703-27-1,5486550,3,3,CC(=O)OC1C=CC2C3CC4=C5C2(C1OC5=C(C=C4)OC)CCN3C,Sedatives,...,COc1ccc2c3c1OC1C(OC(C)=O)C=CC4C(C2)N(C)CCC341,3,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11111111,00011000,00101000,01111011,11111000
3,Codeine,,C18H21NO3,299.4,76-57-3,5284371,1,1,CN1CCC23C4C1CC5=C2C(=C(C=C5)OC)OC3C(C=C4)O,Sedatives,...,COc1ccc2c3c1OC1C(O)C=CC4C(C2)N(C)CCC341,3,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11111111,00111000,00101001,00111001,00101000
4,Hydrocodone,,C18H21NO3,299.4,125-29-1,5284569,1,1,CN1CCC23C4C1CC5=C2C(=C(C=C5)OC)OC3C(=O)CC4,Sedatives,...,COc1ccc2c3c1OC1C(=O)CCC4C(C2)N(C)CCC314,3,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11111111,01111001,00101001,11111011,01111111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,5-MeO-AMT,1-(5-methoxy-1H-indol-3-yl)propan-2-amine,C12H16N2O,204.27,1137-04-8,36906,3,7,CC(CC1=CNC2=C1C=C(C=C2)OC)N,Serotonergic psychedelics,...,COc1ccc2[nH]cc(CC(C)N)c2c1,15,0000000000000000000000000000000000000000000000...,0100000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",00010111,00110111,00110000,00110110,00110111
170,5-MeO-MiPT,N-[2-(5-methoxy-1H-indol-3-yl)ethyl]-N-methylp...,C15H22N2O,246.35,96096-55-8,2763156,5,7,CC(C)N(C)CCC1=CNC2=C1C=C(C=C2)OC,Serotonergic psychedelics,...,COc1ccc2[nH]cc(CCN(C)C(C)C)c2c1,17,0000000000000000000000000000000000000000000000...,0100000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11111111,00111111,00110111,01111111,00110111
171,5-MeO-DMT,"5-Methoxy-N,N-dimethyltryptamine",C13H18N2O,218.29,1019-45-0,1832,4,10,CN(C)CCC1=CNC2=C1C=C(C=C2)OC,Serotonergic psychedelics,...,COc1ccc2[nH]cc(CCN(C)C)c2c1,15,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11111111,00110111,00110111,01111111,00110111
172,5-MeO-DiPT,"5-Methoxy-N,N-diisopropyltryptamine",C17H26N2O,274.4,4021-34-5,151182,6,4,CC(C)N(CCC1=CNC2=C1C=C(C=C2)OC)C(C)C,Serotonergic psychedelics,...,COc1ccc2[nH]cc(CCN(C(C)C)C(C)C)c2c1,17,0000000000000000000000000000000000000000000000...,0100000000000000000000000000000000000000000000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11111111,00110111,00110101,01111111,00110111


In [36]:
NPS.drop(["fp"],axis=1).to_csv(output_path+"Drugs Raman or SERS in literature _ Paper 1.csv",index=False)
NPS_decoy.drop(["fp"],axis=1).to_csv(output_path+"Inactives Drugs Raman or SERS in literature _ Paper 1.csv",index=False)