# Compute molecule descriptors

Use RDKit to compute a set of molecular descriptors for each SMILES string.

Define a function to compute all RDKit descriptors.

In [1]:
from rdkit import Chem
from rdkit.Chem import Descriptors


# Function to compute various molecular descriptors
def compute_descriptors(smiles):
    """Compute every descriptor listed in ``Descriptors.descList`` for one molecule.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        A list with one value per entry of ``Descriptors.descList``, in the
        same order. When ``smiles`` is not a string (e.g. ``None`` or the
        float NaN pandas uses for an empty cell) or cannot be parsed into a
        molecule, the list is filled with ``None`` instead.
    """
    n_descriptors = len(Descriptors.descList)

    # Missing cells reach us as non-string values; handing those to
    # MolFromSmiles raises rather than returning None, so treat them as
    # invalid SMILES up front.
    if not isinstance(smiles, str):
        return [None] * n_descriptors

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:  # the SMILES string could not be parsed
        return [None] * n_descriptors

    # Each descList entry is a (name, function) pair; apply every function.
    return [func(mol) for _name, func in Descriptors.descList]


# Column labels: the name part of each (name, function) entry, in descList order.
descriptor_columns = [name for name, *_ in Descriptors.descList]

Load our original DataFrame.

In [2]:
import os
import pandas as pd

# Where the table with descriptor columns added will be written.
CSV_DIR = "../../files/csv/pka"
CSV_SAVE_PATH = os.path.join(CSV_DIR, "pka_with_desc.csv")

# Source table, read straight from its URL.
CSV_PATH = "https://gitlab.com/oasci/courses/pitt/biosc1540-2024s/-/raw/main/biosc1540/files/csv/pka/pka_data.csv"

df_smiles = pd.read_csv(filepath_or_buffer=CSV_PATH)

Compute all molecular descriptors.
This takes about 5 seconds.

In [3]:
# One row of descriptor values per SMILES string, labelled with the descriptor names.
descriptors = pd.DataFrame(
    list(map(compute_descriptors, df_smiles["SMILES"])),
    columns=descriptor_columns,
)

In [4]:
# Bare last expression: the notebook renders the frame with its rich table
# display instead of the plain-text dump produced by print().
descriptors

      MaxAbsEStateIndex  MaxEStateIndex  MinAbsEStateIndex  MinEStateIndex  \
0              4.377685        4.377685           0.946736        0.946736   
1              4.292315        4.292315           0.885370        0.885370   
2              4.212037        4.212037           1.029514        1.029514   
3              4.201481        4.201481           1.037685        1.037685   
4              4.233866        4.233866           1.023148        1.023148   
...                 ...             ...                ...             ...   
1709           4.057982        4.057982           0.836046        0.836046   
1710           3.722222        3.722222           1.638889        1.638889   
1711           4.333333        4.333333           1.513889        1.513889   
1712           3.740741        3.740741           1.601852        1.601852   
1713           3.555556        3.555556           1.444444        1.444444   

           qed        SPS    MolWt  HeavyAtomMolWt  ExactMolWt 

In [5]:
# Append the descriptor columns to the original table (rows are matched by index).
df_smiles = pd.concat([df_smiles, descriptors], axis=1)

# Keep only rows without a missing value in any column; this also removes
# molecules whose descriptors were filled with None.
df_smiles = df_smiles.dropna(how="any")

# to_csv does not create missing parent directories, so make sure the
# destination folder exists before writing.
os.makedirs(CSV_DIR, exist_ok=True)
df_smiles.to_csv(CSV_SAVE_PATH, index=False)