In [48]:
import pandas as pd
import numpy as np
import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.preprocessing import StandardScaler

In [2]:
input_file = '../TRPM8-bootcamp-project/1_preprocess/TRPM8-homosapien-compounds-activities-processed.csv'
df = pd.read_csv(input_file)
print(df.head())

  Molecule ChEMBL ID  Molecular Weight  #RO5 Violations  AlogP Compound Key  \
0      CHEMBL3235962            421.42              1.0   5.76           22   
1      CHEMBL3235983            434.36              1.0   5.45           44   
2      CHEMBL1650511            467.41              1.0   7.09            5   
3      CHEMBL2443068            438.83              1.0   5.39           9b   
4      CHEMBL3959823            358.44              0.0   3.86           9n   

                                              Smiles Standard Type  \
0  N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc(C(F)(F...          IC50   
1  C[C@H](NC(=O)N1CCc2ccccc2[C@H]1c1ccc(C(F)(F)F)...          IC50   
2  FC(F)(F)c1ccccc1-c1cc(C(F)(F)F)c2[nH]c(C3=NOC4...          IC50   
3  O=C1CC2(CCN(C(=O)Nc3ccc(C(F)(F)F)cc3)CC2)Oc2c(...          IC50   
4  Cc1cccc(CN(C(=O)c2ccccc2)[C@@H](C(N)=O)c2ccccc...          IC50   

   Standard Value Standard Units  pChEMBL Value  ...  Action Type  \
0          83.000             nM   

In [3]:
column_names = df.columns
column_names

Index(['Molecule ChEMBL ID', 'Molecular Weight', '#RO5 Violations', 'AlogP',
       'Compound Key', 'Smiles', 'Standard Type', 'Standard Value',
       'Standard Units', 'pChEMBL Value', 'Data Validity Comment',
       'Ligand Efficiency BEI', 'Ligand Efficiency LE',
       'Ligand Efficiency LLE', 'Ligand Efficiency SEI', 'Action Type',
       'InChI', 'Potency', 'Molecular Weight_standardized',
       'AlogP_standardized', 'Standard Value_standardized',
       'pChEMBL Value_standardized', 'Ligand Efficiency LE_standardized',
       'Ligand Efficiency LLE_standardized',
       'Ligand Efficiency SEI_standardized'],
      dtype='object')

In [3]:
def calculate_topological_descriptors(smiles):
    m = Chem.MolFromSmiles(smiles)
    dict = {}
    #Trying to use a dictionary
    
    dict["chembl ID"] = df["Molecule ChEMBL ID"]
    dict["Smiles"] = df["Smiles"]
    dict["Standard Value"] = df["Standard Value"]
    dict["BalabanJ"] = Descriptors.BalabanJ(m)
    dict["TPSA"] = Descriptors.TPSA(m)
    dict["Ipc"] = Descriptors.Ipc(m)
    dict["HallKierAlpha"] = Descriptors.HallKierAlpha(m)
    dict["Kappa1"] = Descriptors.Kappa1(m)
    dict["Kappa2"] = Descriptors.Kappa2(m)
    dict["Kappa3"] = Descriptors.Kappa3(m)
    dict["Chi0"] = Descriptors.Chi0(m)
    dict["Chi1"] = Descriptors.Chi1(m)
    dict["Chi0n"] = Descriptors.Chi0n(m)
    dict["Chi1n"] = Descriptors.Chi1n(m)
    dict["Chi2n"] = Descriptors.Chi2n(m)
    dict["Chi3n"] = Descriptors.Chi3n(m)
    dict["Chi4n"] = Descriptors.Chi4n(m)
    dict["Chi0v"] = Descriptors.Chi0v(m)
    dict["Chi1v"] = Descriptors.Chi1v(m)
    dict["Chi2v"] = Descriptors.Chi2v(m)
    dict["Chi3v"] = Descriptors.Chi3v(m)
    dict["Chi4v"] = Descriptors.Chi4v(m)
    dict["PEOE_VSA1"] = Descriptors.PEOE_VSA1(m)
    dict["PEOE_VSA2"] = Descriptors.PEOE_VSA2(m)
    dict["PEOE_VSA3"] = Descriptors.PEOE_VSA3(m)
    dict["PEOE_VSA4"] = Descriptors.PEOE_VSA4(m)
    dict["PEOE_VSA5"] = Descriptors.PEOE_VSA5(m)
    dict["PEOE_VSA6"] = Descriptors.PEOE_VSA6(m)
    dict["PEOE_VSA7"] = Descriptors.PEOE_VSA7(m)
    dict["PEOE_VSA8"] = Descriptors.PEOE_VSA8(m)
    dict["PEOE_VSA9"] = Descriptors.PEOE_VSA9(m)
    dict["PEOE_VSA10"] = Descriptors.PEOE_VSA10(m)
    dict["PEOE_VSA11"] = Descriptors.PEOE_VSA11(m)
    dict["PEOE_VSA12"] = Descriptors.PEOE_VSA12(m)
    dict["PEOE_VSA13"] = Descriptors.PEOE_VSA13(m)
    dict["PEOE_VSA14"] = Descriptors.PEOE_VSA14(m)
    
    return dict

smiles_list = df["Smiles"].tolist()
topological_dict = [calculate_topological_descriptors(x) for x in smiles_list]
topological_df = pd.DataFrame.from_dict(topological_dict)
topological_df



Unnamed: 0,chembl ID,Smiles,Standard Value,BalabanJ,TPSA,Ipc,HallKierAlpha,Kappa1,Kappa2,Kappa3,...,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14
0,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,1.690515,56.13,1.039014e+07,-3.79,20.481470,8.149739,4.237594,...,0.000000,42.464569,53.443011,12.232143,23.237965,0.000000,0.000000,0.000000,0.00000,12.207413
1,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,2.023307,32.34,2.699429e+06,-2.78,21.917604,7.806619,4.364645,...,0.000000,30.331835,42.167647,6.544756,11.605292,11.859062,0.000000,0.000000,0.00000,18.383712
2,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,1.612535,50.27,3.098475e+07,-3.04,21.787307,7.495135,3.954292,...,0.000000,29.775636,55.009291,6.420822,22.160304,11.312736,5.824404,0.000000,0.00000,12.352597
3,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,1.392979,58.64,4.780479e+06,-2.74,20.529456,7.501947,3.961512,...,0.000000,17.667307,36.398202,31.618542,22.570358,11.350563,5.783245,0.000000,0.00000,12.207413
4,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,2.046188,63.40,1.415232e+06,-3.40,18.392847,8.319859,4.416427,...,0.000000,78.359856,30.183374,12.108208,0.000000,6.041841,0.000000,5.907180,5.90718,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,1.551374,57.26,7.802733e+05,-2.29,18.499572,7.527031,4.456538,...,0.000000,12.132734,46.616234,29.800917,11.250838,5.817863,0.000000,0.000000,0.00000,12.207413
650,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,2.335188,59.42,1.494366e+04,-1.59,12.856309,5.370257,2.626550,...,11.336786,23.733674,19.056471,10.586085,6.606882,5.007624,4.877147,5.879988,0.00000,5.969305
651,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,2.020584,102.50,4.517308e+04,-2.42,11.658106,4.687837,2.147235,...,21.534149,0.000000,12.137122,12.393687,12.741600,15.578699,0.000000,11.704393,0.00000,0.000000
652,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 83.000 1 10.000 2 0.41...,2.332605,50.19,2.005623e+04,-1.75,13.676346,5.484467,3.172661,...,11.336786,25.980209,18.050640,11.984273,0.000000,15.701992,5.783245,5.879988,0.00000,0.000000


In [49]:
#Standardize values

scaler = StandardScaler()
val_only_topological_df = topological_df.drop(["Smiles", "chembl ID", "Standard Value"], axis=1)

def scale(series):
    array = series.tolist()
    reshape = np.reshape(array, (-1, 1))
    scaler.fit(reshape)
    return scaler.transform(reshape).tolist()

standard_topological_df = val_only_topological_df.apply(scale)
final_topological_df = standard_topological_df.assign(
    Smiles=topological_df["Smiles"], 
    chembl_ID=topological_df["chembl ID"], 
    Standard_Value=topological_df["Standard Value"])
final_topological_df

Unnamed: 0,BalabanJ,TPSA,Ipc,HallKierAlpha,Kappa1,Kappa2,Kappa3,Chi0,Chi1,Chi0n,...,PEOE_VSA8,PEOE_VSA9,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,Smiles,chembl_ID,Standard_Value
0,[-0.417214993177375],[-0.9975361492630956],[-0.10496707989299657],[-0.6127921571341065],[-0.24298347339498366],[-0.19663844696832664],[-0.17217798450740723],[-0.1067120903922174],[0.0037764700915145707],[-0.11689340693904678],...,[-0.7002306128887251],[0.7550498358335551],[-1.525386493545035],[-0.5918030527461284],[-0.6771588645746952],[-1.3810748987182646],[0.9439784211002837],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
1,[0.7349589311384751],[-2.0537947437128024],[-0.10889794083741827],[0.6703707259642091],[0.07626756287584677],[-0.3857859434072677],[-0.08572921767581845],[-0.050193994618899486],[-0.33521869620095873],[-0.3500570659438507],...,[-1.2707225306462093],[-0.6060629509094858],[0.4940408330365943],[-0.5918030527461284],[-0.6771588645746952],[-1.3810748987182646],[2.080681684268154],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
2,[-0.6871921271069986],[-1.2577158618934146],[-0.09444080909080257],[0.3400515679388971],[0.04730279685573712],[-0.5574937508457232],[-0.3649438062021891],[0.20571279463649153],[0.24579681790300553],[0.17140622564102603],...,[-1.2831542151860111],[0.6289551891153624],[0.40100941163123344],[0.5963460219116224],[-0.6771588645746952],[-1.3810748987182646],[0.970698491689003],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
3,[-1.4473270176779094],[-0.8860939856620549],[-0.10783427908514086],[0.7211890579681012],[-0.23231626589592966],[-0.5537385482957269],[-0.36003094644877004],[-0.2146633100313446],[-0.23914544772738525],[-0.3033392999079678],...,[1.2443857331081998],[0.6769346237158338],[0.40745077582519224],[0.5879496814436429],[-0.6771588645746952],[-1.3810748987182646],[0.9439784211002837],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
4,[0.8141760417617177],[-0.6747534682353797],[-0.10955431662291219],[-0.11731342009614414],[-0.7072824820093839],[-0.10285876625229295],[-0.05049578502087844],[-0.7182003963265425],[-0.6259603254010454],[-0.47627515175132773],...,[-0.7126622974285267],[-1.9639720090665136],[-0.4965480565290855],[-0.5918030527461284],[0.8342303034377767],[-0.02567620820557312],[-1.302708034646737],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649,[-0.8989405515309058],[-0.9473649760084437],[-0.10987885542116124],[1.2928952930119042],[-0.6835576590085997],[-0.5399105868931691],[-0.023203476033615753],[-0.8261516159656689],[-0.8817079113305956],[-0.9940253765997469],...,[1.062062840574275],[-0.647536878226913],[-0.5346882913937588],[-0.5918030527461284],[-0.6771588645746952],[-1.3810748987182646],[0.9439784211002837],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
650,[1.814738323509257],[-0.8514625563358349],[-0.11027002898828528],[2.1822161030800435],[-1.938049269178776],[-1.7288481066696777],[-1.2683710687539231],[-2.058784396272863],[-2.1768314554698436],[-2.2591592392702142],...,[-0.8653438421621913],[-1.190915663930591],[-0.672660332243532],[0.40311032031299626],[0.8272732141668314],[-1.3810748987182646],[-0.2041003846932004],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
651,[0.725532090536183],[1.0612579249123097],[-0.11025457819671762],[1.1277357139992492],[-2.204408628298594],[-2.105036586455935],[-1.5945084292869967],[-2.0321415104188203],[-1.9459374639393885],[-2.130338781934454],...,[-0.6840263661859587],[-0.4731061569570901],[1.1274414549628313],[-0.5918030527461284],[2.3174838174874455],[-1.3810748987182646],[-1.302708034646737],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
652,[1.8057944837091744],[-1.2612678033627704],[-0.11026741586553296],[1.9789427750644686],[-1.7557558534738493],[-1.6658887612794917],[-0.8967840563065305],[-1.8676721950063762],[-2.052681739461566],[-1.8931443168839255],...,[-0.7250939819683284],[-1.9639720090665136],[1.1484364501578477],[0.5879496814436429],[0.8272732141668314],[-1.3810748987182646],[-1.302708034646737],0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...
