Extract aromaticity as the node (atom) feature and bond type as the edge (bond) feature.

If aromaticity is True --> 0

If aromaticity is False --> 1

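The bonds are classified by type: single --> 0, double --> 1, triple --> 2 and aromatic --> 3 (any other bond type --> -1). Each bond also carries a conjugation flag: conjugated --> 1, not conjugated --> 0.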

In [7]:
import pandas as pd
from rdkit import Chem

# Load the cleaned TRPM8 dataset (path is relative to the working directory)
df = pd.read_csv("1_data/processed_data/TRPM8_cleaned.csv")

# Drop rows whose "Smiles" cell is just the column name (a repeated header
# line); the comparison ignores case and surrounding whitespace.
normalized_smiles = df["Smiles"].str.strip().str.lower()
df = df[normalized_smiles != "smiles"]
print("DataFrame shape after filtering:", df.shape)

DataFrame shape after filtering: (1060, 4)


In [8]:
from rdkit.Chem import rdchem

# Integer code for each supported RDKit bond type; the position in the tuple
# below is the code (single=0, double=1, triple=2, aromatic=3).
bond_type_map = {
    bond_type: code
    for code, bond_type in enumerate(
        (
            rdchem.BondType.SINGLE,
            rdchem.BondType.DOUBLE,
            rdchem.BondType.TRIPLE,
            rdchem.BondType.AROMATIC,
        )
    )
}

# extracting aromaticity from each atom: true is 0 and false is 1
def extract_atom_aromaticity(smiles):
    """
    Encode the aromaticity of every atom in a molecule.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        A list with one integer per atom, in the order given by
        ``mol.GetAtoms()``: 0 for an aromatic atom, 1 for a non-aromatic one.
        None if ``smiles`` is not a string or ``Chem.MolFromSmiles`` returns
        None for it.
    """
    if isinstance(smiles, str):
        molecule = Chem.MolFromSmiles(smiles)
        if molecule is not None:
            # Note the inverted encoding: aromatic -> 0, non-aromatic -> 1
            return [0 if atom.GetIsAromatic() else 1 for atom in molecule.GetAtoms()]
    return None

#extract bond features
def extract_bond_features(smiles):
    """
    Encode every bond of a molecule as a ``(bond_type, conjugated)`` pair.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        A list with one tuple per bond, in the order given by
        ``mol.GetBonds()``:
          - bond type as the integer code from ``bond_type_map``
            (-1 when the bond type is not in the map)
          - conjugation flag as an integer (True -> 1, False -> 0)
        None if ``smiles`` is not a string or ``Chem.MolFromSmiles`` returns
        None for it.
    """
    # Only attempt to parse real strings; anything else is treated as unparsable
    molecule = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if molecule is None:
        return None
    return [
        (bond_type_map.get(bond.GetBondType(), -1), int(bond.GetIsConjugated()))
        for bond in molecule.GetBonds()
    ]


In [9]:
# Derive the per-molecule node and edge features from the SMILES strings
smiles_series = df["Smiles"]
df["AtomAromaticity"] = smiles_series.apply(extract_atom_aromaticity)
df["BondFeatures"] = smiles_series.apply(extract_bond_features)

# Preview the first rows together with the two new feature columns
print(df.head())

  Molecule ChEMBL ID                                             Smiles  \
0      CHEMBL3235962  N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc(C(F)(F...   
1      CHEMBL3235983  C[C@H](NC(=O)N1CCc2ccccc2[C@H]1c1ccc(C(F)(F)F)...   
2      CHEMBL1650511  FC(F)(F)c1ccccc1-c1cc(C(F)(F)F)c2[nH]c(C3=NOC4...   
3      CHEMBL2443068  O=C1CC2(CCN(C(=O)Nc3ccc(C(F)(F)F)cc3)CC2)Oc2c(...   
4      CHEMBL3959823  Cc1cccc(CN(C(=O)c2ccccc2)[C@@H](C(N)=O)c2ccccc...   

  pChEMBL Value  Potency_Class  \
0          7.08            2.0   
1             8            2.0   
2          9.38            2.0   
3          6.64            1.0   
4          6.06            1.0   

                                     AtomAromaticity  \
0  [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, ...   
1  [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, ...   
2  [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...   
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, ...   
4  [1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...   

       