In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

In [5]:
# Names of every descriptor RDKit registers in Descriptors._descList; the name
# is taken from position 0 of each entry. Note that _descList is an
# underscore-prefixed (private) attribute of rdkit.Chem.Descriptors.
descriptor_list = [entry[0] for entry in Descriptors._descList]
print(descriptor_list)

['MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9', 'TPSA'

In [6]:
# One calculator configured with the name (position 0) of every entry in
# Descriptors._descList, i.e. all descriptors RDKit registers there.
calc = MoleculeDescriptors.MolecularDescriptorCalculator(
    [entry[0] for entry in Descriptors._descList]
)

In [7]:
# Descriptor names as reported by the calculator; used further down as the
# column labels of the descriptor table.
header = calc.GetDescriptorNames()

In [8]:
# Load the compound table (path is relative to the notebook's working
# directory). Later cells read its SMILES column to build molecules.
df = pd.read_csv('Data/GSK_3D7.csv')

In [9]:
# Preview three rows; the fixed random_state makes the selection repeatable.
df.sample(n = 3, random_state = 42)

Unnamed: 0.1,Unnamed: 0,COMPOUND_ID,PCT_IHB_3D7,pXC50_3D7,SMILES
12750,23055,537180,93.0,5.85789,CN1CCN(CC1)Cc2ccc(cc2)c3cccc(c3)c4[nH]c5ccccc5...
7187,12603,536115,94.0,5.84021,CCc1cccc(c1)OCC2CCCN(C2)c3ccc(cn3)C(=O)NC4CC4
457,1853,526437,92.0,5.87235,Cn1c(=O)c(c(cn1)c2ccccc2CN3CCCC3CN(C)C)Oc4cccc...


In [10]:
# Parse the SMILES of the row labelled 0 into an RDKit molecule.
# NOTE(review): appears to be a single-molecule trial run before the full loop.
mol = Chem.MolFromSmiles(df.SMILES[0])

In [12]:
# Compute every configured descriptor for that single molecule.
# NOTE(review): `ds` is not referenced again in the visible cells.
ds = calc.CalcDescriptors(mol)

In [18]:
# Compute the full descriptor vector for every compound in `df`.
#
# Rows are collected in a plain list and the DataFrame is built once at the
# end. Growing a frame row by row with `.loc[i, :] = ...` re-allocates it on
# every new label (quadratic in the number of rows) and leaves every column
# with object dtype; building it in one go is linear and lets pandas infer
# numeric dtypes.
#
# The SMILES values are iterated directly (no `df.SMILES[i]` label lookup), and
# the result reuses `df.index`, so the index-aligned `df.join(desc_df)` further
# down matches rows correctly even if `df` does not carry a default 0..n-1
# index.
#
# NOTE(review): Chem.MolFromSmiles returns None for a SMILES it cannot parse;
# that value is passed to CalcDescriptors unchanged, exactly as before. Confirm
# how the calculator reports such rows before trusting them downstream.
descriptor_rows = []
for smiles in df.SMILES:
    mol = Chem.MolFromSmiles(smiles)
    descriptor_rows.append(calc.CalcDescriptors(mol))

desc_df = pd.DataFrame(descriptor_rows, columns=list(header), index=df.index)

In [19]:
# Sanity check: expect one row per compound and one column per descriptor name.
desc_df.shape

(13403, 200)

In [17]:
# Row count of the input table, for comparison with desc_df.shape.
# NOTE(review): this cell's execution count (17) is lower than those of the
# cells placed before it (18, 19), so the notebook was run out of order;
# re-check with Restart Kernel -> Run All.
len(df)

13403

In [22]:
# Preview the first three rows of the computed descriptor table.
desc_df.head(3)

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,12.9537,0.0145533,12.9537,0.0145533,0.428843,528.697,488.377,528.31,206,0,...,0,0,0,0,0,0,0,0,0,0
1,12.6154,-0.575289,12.6154,0.0163207,0.273796,552.1,521.86,551.176,198,0,...,0,0,0,0,0,0,0,1,0,0
2,12.8107,-4.37844,12.8107,0.214526,0.219578,579.572,547.316,579.219,222,0,...,0,0,0,0,0,0,0,0,1,0


In [23]:
# Preview the first three rows of the input table, to compare row labels with
# the descriptor table before the two are joined.
df.head(3)

Unnamed: 0.1,Unnamed: 0,COMPOUND_ID,PCT_IHB_3D7,pXC50_3D7,SMILES
0,0,541908,94.0,6.09857,COc1ccc(c(c1)OC)C2CCN(CC2)CCN3CCC(CC3)NC(=O)c4...
1,1,538495,100.0,6.00393,C[C@H](c1ccccc1Cl)Oc2cc(sc2C(=O)N)n3cnc4c3cc(c...
2,2,538533,100.0,6.79958,COc1ccc2c(c1)c(ccn2)[C@H](CN3CCC(CC3)NCCOc4ccc...


In [25]:
# Attach the descriptor columns to the compound table. DataFrame.join aligns
# the two frames on their index labels, so both must label the same compound
# identically. ("alles" is German for "everything".)
alles = df.join(desc_df)

In [26]:
# Same n and random_state as the earlier df.sample call, so the same three
# rows are drawn, now with their descriptor columns appended.
alles.sample(n = 3, random_state = 42)

Unnamed: 0.1,Unnamed: 0,COMPOUND_ID,PCT_IHB_3D7,pXC50_3D7,SMILES,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
12750,23055,537180,93.0,5.85789,CN1CCN(CC1)Cc2ccc(cc2)c3cccc(c3)c4[nH]c5ccccc5...,10.5787,-5.08333,10.5787,0.916423,0.402453,...,0,0,0,0,0,0,0,0,0,0
7187,12603,536115,94.0,5.84021,CCc1cccc(c1)OCC2CCCN(C2)c3ccc(cn3)C(=O)NC4CC4,12.1191,-0.0131664,12.1191,0.0131664,0.794319,...,0,0,0,0,0,0,0,0,0,0
457,1853,526437,92.0,5.87235,Cn1c(=O)c(c(cn1)c2ccccc2CN3CCCC3CN(C)C)Oc4cccc...,13.0454,-5.08333,13.0454,0.20954,0.458633,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Persist the combined table (original columns + descriptors) next to the input.
# NOTE(review): to_csv writes the index as an extra unnamed first column by
# default; the input file already shows an 'Unnamed: 0' column, which looks
# like the result of an earlier save of this kind. Consider index=False once
# downstream readers of this file have been checked.
alles.to_csv('Data/GSK_3D7_features.csv')