In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

# Load the sample molecules. A relative path keeps the notebook portable;
# it assumes the notebook is run from a directory that sits next to data/,
# the same assumption the "../data/" export path in the last cell makes.
df = pd.read_csv("../data/sample_molecules.csv")

# Convert SMILES to RDKit mol objects
df["mol"] = df["smiles"].apply(Chem.MolFromSmiles)

# MolFromSmiles yields None (rather than raising) when a SMILES string cannot
# be parsed; fail here with the offending strings instead of letting the
# descriptor cell crash later with an opaque error.
unparsed_smiles = df.loc[df["mol"].isna(), "smiles"].tolist()
if unparsed_smiles:
    raise ValueError(f"Could not parse SMILES: {unparsed_smiles}")

df.head()

Unnamed: 0,smiles,activity,mol
0,CCOC(=O)C1=CC=CC=C1,7.2,<rdkit.Chem.rdchem.Mol object at 0x000002E0F73...
1,CCN(CC)CCOC1=CC=CC=C1,6.8,<rdkit.Chem.rdchem.Mol object at 0x000002E0F73...
2,CCOC(=O)NCCC1=CC=CC=C1,7.5,<rdkit.Chem.rdchem.Mol object at 0x000002E0F73...
3,CCOC(=O)C2=CC=CC=C2O,6.9,<rdkit.Chem.rdchem.Mol object at 0x000002E0F73...
4,CCN(CC)CCOC2=CC=CC=C2O,7.0,<rdkit.Chem.rdchem.Mol object at 0x000002E0F73...


In [3]:
def calc_descriptors(mol):
    """Compute a small set of RDKit descriptors for a single molecule.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol or None
        Molecule to describe. ``None`` (what ``Chem.MolFromSmiles`` produces
        for an unparsable SMILES) is accepted.

    Returns
    -------
    dict
        Mapping with the keys ``"MolWt"``, ``"LogP"``, ``"NumHDonors"`` and
        ``"NumHAcceptors"``. When ``mol`` is ``None`` every value is NaN so
        the row is kept and stays aligned with the input frame.
    """
    if mol is None:
        # Keep one record per input row; NaN marks "descriptor unavailable".
        missing = float("nan")
        return {
            "MolWt": missing,
            "LogP": missing,
            "NumHDonors": missing,
            "NumHAcceptors": missing,
        }

    return {
        "MolWt": Descriptors.MolWt(mol),
        "LogP": Descriptors.MolLogP(mol),
        "NumHDonors": Descriptors.NumHDonors(mol),
        "NumHAcceptors": Descriptors.NumHAcceptors(mol)
    }

# One descriptor dict per molecule, in the same row order as df.
descriptor_list = df["mol"].apply(calc_descriptors)

# Reuse df's index so the descriptor rows stay label-aligned with df;
# pd.concat(..., axis=1) aligns on index labels, and a fresh 0..n-1 index
# would misalign rows whenever df's index is not the default one.
desc_df = pd.DataFrame(descriptor_list.tolist(), index=df.index)

desc_df.head()


Unnamed: 0,MolWt,LogP,NumHDonors,NumHAcceptors
0,150.177,1.8633,0,2
1,193.29,2.4072,0,2
2,193.246,1.9752,1,2
3,166.176,1.5689,1,3
4,209.289,2.1128,1,3


In [4]:
# Place the input columns (structure + measured activity) side by side with
# the computed descriptor columns; rows are matched on index labels.
final_df = pd.concat(
    objs=[df.loc[:, ["smiles", "activity"]], desc_df],
    axis="columns",
)
final_df


Unnamed: 0,smiles,activity,MolWt,LogP,NumHDonors,NumHAcceptors
0,CCOC(=O)C1=CC=CC=C1,7.2,150.177,1.8633,0,2
1,CCN(CC)CCOC1=CC=CC=C1,6.8,193.29,2.4072,0,2
2,CCOC(=O)NCCC1=CC=CC=C1,7.5,193.246,1.9752,1,2
3,CCOC(=O)C2=CC=CC=C2O,6.9,166.176,1.5689,1,3
4,CCN(CC)CCOC2=CC=CC=C2O,7.0,209.289,2.1128,1,3
5,CCOC(=O)Nc1ccc(Cl)cc1,7.3,199.637,2.9084,1,2
6,CCN(CC)CCOc1ccc(Cl)cc1,6.7,227.735,3.0606,0,2
7,COC(=O)c1ccc(F)cc1,7.1,154.14,1.6123,0,2
8,CCC(=O)Nc1ccc(Br)cc1,6.8,228.089,2.7976,1,1
9,CCOC(=O)Nc1ccc(I)cc1,7.4,291.088,2.8596,1,2


In [5]:
# Persist the combined table; index=False leaves the row index out of the file.
final_df.to_csv(
    path_or_buf="../data/descriptors_dataset.csv",
    index=False,
)
