In [13]:
import pandas as pd
import numpy as np
import rdkit
import padelpy
import PaDEL_pywrapper
from PaDEL_pywrapper import PaDEL
from PaDEL_pywrapper.descriptor import WienerNumbers, ZagrebIndex, Topological, TopologicalCharge, VAdjMa, PetitjeanNumber
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.preprocessing import StandardScaler
from padelpy import from_smiles

In [2]:
input_file = '../TRPM8-bootcamp-project/1_preprocess/TRPM8-homosapien-compounds-activities-processed.csv'
df = pd.read_csv(input_file)
print(df.head())

  Molecule ChEMBL ID  Molecular Weight  #RO5 Violations  AlogP Compound Key  \
0      CHEMBL3235962            421.42              1.0   5.76           22   
1      CHEMBL3235983            434.36              1.0   5.45           44   
2      CHEMBL1650511            467.41              1.0   7.09            5   
3      CHEMBL2443068            438.83              1.0   5.39           9b   
4      CHEMBL3959823            358.44              0.0   3.86           9n   

                                              Smiles Standard Type  \
0  N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc(C(F)(F...          IC50   
1  C[C@H](NC(=O)N1CCc2ccccc2[C@H]1c1ccc(C(F)(F)F)...          IC50   
2  FC(F)(F)c1ccccc1-c1cc(C(F)(F)F)c2[nH]c(C3=NOC4...          IC50   
3  O=C1CC2(CCN(C(=O)Nc3ccc(C(F)(F)F)cc3)CC2)Oc2c(...          IC50   
4  Cc1cccc(CN(C(=O)c2ccccc2)[C@@H](C(N)=O)c2ccccc...          IC50   

   Standard Value Standard Units  pChEMBL Value  ...  Action Type  \
0          83.000             nM   

In [3]:
column_names = df.columns
column_names

Index(['Molecule ChEMBL ID', 'Molecular Weight', '#RO5 Violations', 'AlogP',
       'Compound Key', 'Smiles', 'Standard Type', 'Standard Value',
       'Standard Units', 'pChEMBL Value', 'Data Validity Comment',
       'Ligand Efficiency BEI', 'Ligand Efficiency LE',
       'Ligand Efficiency LLE', 'Ligand Efficiency SEI', 'Action Type',
       'InChI', 'Potency', 'Molecular Weight_standardized',
       'AlogP_standardized', 'Standard Value_standardized',
       'pChEMBL Value_standardized', 'Ligand Efficiency LE_standardized',
       'Ligand Efficiency LLE_standardized',
       'Ligand Efficiency SEI_standardized'],
      dtype='object')

In [31]:
smiles_list = df["Smiles"].tolist()
smiles_list_chem = [Chem.MolFromSmiles(x) for x in smiles_list]


descriptors = [WienerNumbers, ZagrebIndex, Topological, TopologicalCharge, VAdjMa, PetitjeanNumber]
padel = PaDEL(descriptors)
padel_dict = padel.calculate(smiles_list_chem)
padel_df = pd.DataFrame.from_dict(padel_dict)

PaDEL-Descriptor is a software for calculating molecular
descriptors and fingerprints. The software calculates
1875 descriptors (1444 1D and 2D descriptors, and 431
3D descriptors) and 12 types of fingerprints.

###################################

Should you publish results based on the PaDEL descriptors,
please cite:

Yap, C.W. (2011), PaDEL-descriptor: An open source software
to calculate molecular descriptors and fingerprints.
J. Comput. Chem., 32: 1466-1474. https://doi.org/10.1002/jcc.21707

###################################





In [34]:
def calculate_topological_descriptors(smiles):
    m = Chem.MolFromSmiles(smiles)
    dict = {}
    #Trying to use a dictionary
    
    dict["chembl ID"] = df["Molecule ChEMBL ID"]
    dict["Smiles"] = df["Smiles"]
    dict["Standard Value"] = df["Standard Value"]
    dict["BalabanJ"] = Descriptors.BalabanJ(m)
    dict["TPSA"] = Descriptors.TPSA(m)
    dict["Ipc"] = Descriptors.Ipc(m)
    dict["HallKierAlpha"] = Descriptors.HallKierAlpha(m)
    dict["Kappa1"] = Descriptors.Kappa1(m)
    dict["Kappa2"] = Descriptors.Kappa2(m)
    dict["Kappa3"] = Descriptors.Kappa3(m)
    dict["Chi0"] = Descriptors.Chi0(m)
    dict["Chi1"] = Descriptors.Chi1(m)
    dict["Chi0n"] = Descriptors.Chi0n(m)
    dict["Chi1n"] = Descriptors.Chi1n(m)
    dict["Chi2n"] = Descriptors.Chi2n(m)
    dict["Chi3n"] = Descriptors.Chi3n(m)
    dict["Chi4n"] = Descriptors.Chi4n(m)
    dict["Chi0v"] = Descriptors.Chi0v(m)
    dict["Chi1v"] = Descriptors.Chi1v(m)
    dict["Chi2v"] = Descriptors.Chi2v(m)
    dict["Chi3v"] = Descriptors.Chi3v(m)
    dict["Chi4v"] = Descriptors.Chi4v(m)
    dict["PEOE_VSA1"] = Descriptors.PEOE_VSA1(m)
    dict["PEOE_VSA2"] = Descriptors.PEOE_VSA2(m)
    dict["PEOE_VSA3"] = Descriptors.PEOE_VSA3(m)
    dict["PEOE_VSA4"] = Descriptors.PEOE_VSA4(m)
    dict["PEOE_VSA5"] = Descriptors.PEOE_VSA5(m)
    dict["PEOE_VSA6"] = Descriptors.PEOE_VSA6(m)
    dict["PEOE_VSA7"] = Descriptors.PEOE_VSA7(m)
    dict["PEOE_VSA8"] = Descriptors.PEOE_VSA8(m)
    dict["PEOE_VSA9"] = Descriptors.PEOE_VSA9(m)
    dict["PEOE_VSA10"] = Descriptors.PEOE_VSA10(m)
    dict["PEOE_VSA11"] = Descriptors.PEOE_VSA11(m)
    dict["PEOE_VSA12"] = Descriptors.PEOE_VSA12(m)
    dict["PEOE_VSA13"] = Descriptors.PEOE_VSA13(m)
    dict["PEOE_VSA14"] = Descriptors.PEOE_VSA14(m)
    
    return dict

smiles_list = df["Smiles"].tolist()
rdk_topological_dict = [calculate_topological_descriptors(x) for x in smiles_list]
rdk_df = pd.DataFrame.from_dict(rdk_topological_dict)
topological_df = pd.concat([rdk_df, padel_df], axis = 1)
topological_df


Unnamed: 0,chembl ID,Smiles,Standard Value,BalabanJ,TPSA,Ipc,HallKierAlpha,Kappa1,Kappa2,Kappa3,...,JGI6,JGI7,JGI8,JGI9,JGI10,JGT,VAdjMat,WPATH,WPOL,Zagreb
0,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,1.690515,56.13,1.039014e+07,-3.79,20.481470,8.149739,4.237594,...,0.017088,0.016634,0.011785,0.008406,0.008588,0.482208,5.954196,2680.0,52.0,166.0
1,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,2.023307,32.34,2.699429e+06,-2.78,21.917604,7.806619,4.364645,...,0.023407,0.022724,0.018507,0.011464,0.012370,0.609510,5.906891,2349.0,52.0,162.0
2,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,1.612535,50.27,3.098475e+07,-3.04,21.787307,7.495135,3.954292,...,0.023434,0.019966,0.012460,0.010282,0.007505,0.579610,6.044394,3076.0,57.0,190.0
3,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,1.392979,58.64,4.780479e+06,-2.74,20.529456,7.501947,3.961512,...,0.012577,0.021862,0.014073,0.007778,0.009110,0.608409,5.906891,2757.0,52.0,166.0
4,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,2.046188,63.40,1.415232e+06,-3.40,18.392847,8.319859,4.416427,...,0.014694,0.008605,0.008151,0.006562,0.004938,0.378021,5.754888,1730.0,41.0,136.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,1.551374,57.26,7.802733e+05,-2.29,18.499572,7.527031,4.456538,...,0.023059,0.010995,0.013450,0.010074,0.007778,0.544606,5.700440,1961.0,37.0,136.0
650,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,2.335188,59.42,1.494366e+04,-1.59,12.856309,5.370257,2.626550,...,0.017755,0.017284,0.012818,0.009125,0.014437,0.511274,5.169925,637.0,25.0,90.0
651,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,2.020584,102.50,4.517308e+04,-2.42,11.658106,4.687837,2.147235,...,0.013835,0.010978,0.008736,0.005000,0.006870,0.408149,5.247928,725.0,26.0,100.0
652,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,2.332605,50.19,2.005623e+04,-1.75,13.676346,5.484467,3.172661,...,0.020375,0.018229,0.010254,0.014063,0.006173,0.535801,5.247928,731.0,26.0,96.0


In [35]:
#Standardize values

scaler = StandardScaler()
val_only_topological_df = topological_df.drop(["Smiles", "chembl ID", "Standard Value"], axis=1)

def scale(series):
    array = series.tolist()
    reshape = np.reshape(array, (-1, 1))
    scaler.fit(reshape)
    return scaler.transform(reshape).tolist()

standard_topological_df = val_only_topological_df.apply(scale)
final_topological_df = standard_topological_df.assign(
    Smiles=topological_df["Smiles"], 
    chembl_ID=topological_df["chembl ID"], 
    Standard_Value=topological_df["Standard Value"])
final_topological_df

Unnamed: 0,BalabanJ,TPSA,Ipc,HallKierAlpha,Kappa1,Kappa2,Kappa3,Chi0,Chi1,Chi0n,...,JGI9,JGI10,JGT,VAdjMat,WPATH,WPOL,Zagreb,Smiles,chembl_ID,Standard_Value
0,[-0.417214993177375],[-0.9975361492630956],[-0.10496707989299657],[-0.6127921571341065],[-0.24298347339498366],[-0.19663844696832664],[-0.17217798450740723],[-0.1067120903922174],[0.0037764700915145707],[-0.11689340693904678],...,[-0.2171615910466616],[0.41087895457179235],[-0.4376906133440387],[0.06628565694135109],[-0.24826519708904063],[-0.02522645165125391],[-0.04101197993277686],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
1,[0.7349589311384751],[-2.0537947437128024],[-0.10889794083741827],[0.6703707259642091],[0.07626756287584677],[-0.3857859434072677],[-0.08572921767581845],[-0.050193994618899486],[-0.33521869620095873],[-0.3500570659438507],...,[0.885910152996196],[1.801845413213441],[1.0576186901720104],[-0.07770296975623825],[-0.49061587947856167],[-0.02522645165125391],[-0.14873019228633486],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
2,[-0.6871921271069986],[-1.2577158618934146],[-0.09444080909080257],[0.3400515679388971],[0.04730279685573712],[-0.5574937508457232],[-0.3649438062021891],[0.20571279463649153],[0.24579681790300553],[0.17140622564102603],...,[0.4593135325821712],[0.012614668607169022],[0.7064140751606879],[0.34082877939071043],[0.04167700903256158],[0.3413979790136331],[0.605297294188571],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
3,[-1.4473270176779094],[-0.8860939856620549],[-0.10783427908514086],[0.7211890579681012],[-0.23231626589592966],[-0.5537385482957269],[-0.36003094644877004],[-0.2146633100313446],[-0.23914544772738525],[-0.3033392999079678],...,[-0.4438593849967802],[0.603189908355184],[1.0446856041377264],[-0.07770296975623825],[-0.19188754589872908],[-0.02522645165125391],[-0.04101197993277686],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
4,[0.8141760417617177],[-0.6747534682353797],[-0.10955431662291219],[-0.11731342009614414],[-0.7072824820093839],[-0.10285876625229295],[-0.05049578502087844],[-0.7182003963265425],[-0.6259603254010454],[-0.47627515175132773],...,[-0.8822252849002782],[-0.9312326123195876],[-1.6614928271866978],[-0.5403683534521518],[-0.9438336208656115],[-0.8318001991140053],[-0.8488985725844618],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649,[-0.8989405515309058],[-0.9473649760084437],[-0.10987885542116124],[1.2928952930119042],[-0.6835576590085997],[-0.5399105868931691],[-0.023203476033615753],[-0.8261516159656689],[-0.8817079113305956],[-0.9940253765997469],...,[0.38424695364446],[0.11326142877412088],[0.2952450647211107],[-0.7060959338267699],[-0.7747006672946769],[-1.125099743645915],[-0.8488985725844618],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
650,[1.814738323509257],[-0.8514625563358349],[-0.11027002898828528],[2.1822161030800435],[-1.938049269178776],[-1.7288481066696777],[-1.2683710687539231],[-2.058784396272863],[-2.1768314554698436],[-2.2591592392702142],...,[0.042100526896259115],[2.5622600310426136],[-0.09627715090107536],[-2.320870932116543],[-1.744103396852761],[-2.0049983772416438],[-2.0876580146503785],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
651,[0.725532090536183],[1.0612579249123097],[-0.11025457819671762],[1.1277357139992492],[-2.204408628298594],[-2.105036586455935],[-1.5945084292869967],[-2.0321415104188203],[-1.9459374639393885],[-2.130338781934454],...,[-1.4458385847761575],[-0.22077653590364493],[-1.3076088439449947],[-2.0834477236384825],[-1.679671795492405],[-1.9316734911086664],[-1.8183624837664836],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
652,[1.8057944837091744],[-1.2612678033627704],[-0.11026741586553296],[1.9789427750644686],[-1.7557558534738493],[-1.6658887612794917],[-0.8967840563065305],[-1.8676721950063762],[-2.052681739461566],[-1.8931443168839255],...,[1.8231185545041524],[-0.4771911409481179],[0.19181877413013929],[-2.0834477236384825],[-1.6752787317632898],[-1.9316734911086664],[-1.9260806961200416],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
