In [17]:
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem import SaltRemover
from rdkit.Chem.MolStandardize import rdMolStandardize

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import torch
from torch_geometric.data import Data
from tqdm import tqdm

In [25]:
# Turn off RDKit log output for every channel matching 'rdApp.*'
# (presumably to keep per-molecule messages out of the cell outputs).
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

In [18]:
# Standardization helpers, built once with default constructor arguments and
# reused by strip_salts, neutralize_mol and canonicalize_tautomer below.
salt_remover = SaltRemover.SaltRemover()
uncharger = rdMolStandardize.Uncharger()
tautomer_enumerator = rdMolStandardize.TautomerEnumerator()

In [19]:
# Read the raw datasheet and check that both columns used downstream exist.
df = pd.read_csv("rdkit_datasheet.csv")

for required_column in ("smiles_raw", "activity_label"):
    assert required_column in df.columns

# Store the activity label as a plain integer column.
df = df.assign(activity_label=df["activity_label"].astype(int))

print(df.shape)
df.head()

(169, 2)


Unnamed: 0,smiles_raw,activity_label
0,CCCCCCCCCC(=O)NC1CCCC1,0
1,C1=C(/C(=C/Br)/OC1=O)Br,1
2,C1=CC=C(C=C1)/C=C/C=O,0
3,CCCCCCCCCNC(=O)CC(=O)C1=CC=CC=C1,1
4,CCCCCCCCCCCCN1N=C(N=N1)CC(=O)O,1


In [20]:
def smiles_to_mol(smiles):
    """Parse a SMILES string into an RDKit molecule, returning None on failure.

    Parameters
    ----------
    smiles : str
        SMILES text to parse. Any non-string value (for example a float NaN
        standing in for a missing cell) is treated as unparseable.

    Returns
    -------
    object or None
        Whatever ``Chem.MolFromSmiles`` returns for the string, or None when
        the input is not a string or the parser raises an exception.
    """
    # Reject non-text input up front instead of relying on the parser to raise.
    if not isinstance(smiles, str):
        return None
    try:
        return Chem.MolFromSmiles(smiles)
    except Exception:
        # Best-effort parsing: ordinary errors mark the entry as invalid.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so the cell can still be interrupted.
        return None

# Parse each raw SMILES entry; the "mol" column holds smiles_to_mol's result
# (None where that function reports a failure).
df["mol"] = df["smiles_raw"].apply(smiles_to_mol)


In [22]:
# Keep only rows that have a molecule object, then renumber the index from 0.
df = df.loc[df["mol"].notna()].reset_index(drop=True)
print("After invalid removal:", df.shape)

After invalid removal: (169, 3)


In [23]:
def strip_salts(mol):
    """Remove salt fragments from a molecule using the shared ``salt_remover``.

    Parameters
    ----------
    mol : object or None
        Molecule to process. None is passed through as None.

    Returns
    -------
    object or None
        The result of ``salt_remover.StripMol(mol, dontRemoveEverything=True)``,
        or None if the input is None or stripping raises an exception.
    """
    if mol is None:
        return None
    try:
        return salt_remover.StripMol(mol, dontRemoveEverything=True)
    except Exception:
        # Failures drop the molecule (the caller filters out None rows).
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        return None
# Strip salts from every molecule and discard rows where stripping failed.
df = (
    df.assign(mol=df["mol"].apply(strip_salts))
    .loc[lambda frame: frame["mol"].notna()]
    .reset_index(drop=True)
)

In [26]:
def neutralize_mol(mol):
    """Neutralize a molecule with the shared ``uncharger``, best-effort.

    Parameters
    ----------
    mol : object or None
        Molecule to neutralize. None is returned unchanged.

    Returns
    -------
    object or None
        The result of ``uncharger.uncharge(mol)``; if that call raises an
        exception, the input molecule is returned unchanged.
    """
    if mol is None:
        return None
    try:
        return uncharger.uncharge(mol)
    except Exception:
        # Deliberate fallback: keep the original molecule when neutralization
        # fails. `Exception` (not a bare except) keeps the fallback for real
        # errors while letting KeyboardInterrupt/SystemExit propagate.
        # NOTE(review): failures are not recorded anywhere, so molecules that
        # were left un-neutralized cannot be identified afterwards.
        return mol

# Replace each molecule with neutralize_mol's result (the original molecule
# is kept wherever neutralization fails).
df["mol"] = df["mol"].apply(neutralize_mol)

In [27]:
def canonicalize_tautomer(mol):
    """Return the canonical tautomer of a molecule, best-effort.

    Parameters
    ----------
    mol : object or None
        Molecule to canonicalize. None is returned unchanged.

    Returns
    -------
    object or None
        The result of ``tautomer_enumerator.Canonicalize(mol)``; if that call
        raises an exception, the input molecule is returned unchanged.
    """
    if mol is None:
        return None
    try:
        return tautomer_enumerator.Canonicalize(mol)
    except Exception:
        # Deliberate fallback: keep the input tautomer when canonicalization
        # fails. A bare except here would also turn a Ctrl-C into "keep the
        # original molecule", so only `Exception` is caught.
        # NOTE(review): failures are silent, so the result may mix canonical
        # and non-canonical tautomers without any indication.
        return mol

# Replace each molecule with canonicalize_tautomer's result (the original
# molecule is kept wherever canonicalization fails).
df["mol"] = df["mol"].apply(canonicalize_tautomer)

In [28]:
def sanitize_mol(mol):
    """Run ``Chem.SanitizeMol`` on a molecule and return it, or None on failure.

    Parameters
    ----------
    mol : object or None
        Molecule to sanitize. It is passed to ``Chem.SanitizeMol`` and the
        same object is returned afterwards.

    Returns
    -------
    object or None
        The input molecule if sanitization completes, otherwise None
        (including when the input itself is None).
    """
    if mol is None:
        return None
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Molecules that fail sanitization are dropped by the caller.
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        return None
    return mol

# Sanitize every molecule and discard rows where sanitization failed.
df = (
    df.assign(mol=df["mol"].apply(sanitize_mol))
    .loc[lambda frame: frame["mol"].notna()]
    .reset_index(drop=True)
)

In [29]:
# Write each standardized molecule back out as a canonical SMILES string;
# the keyword argument is forwarded by Series.apply to Chem.MolToSmiles.
df["smiles_canonical"] = df["mol"].apply(Chem.MolToSmiles, canonical=True)


In [31]:
# Keep the first row for each canonical SMILES and renumber the index.
# NOTE(review): rows that share a canonical SMILES but carry different
# activity_label values collapse to the first occurrence — confirm intended.
df = df.loc[~df.duplicated(subset="smiles_canonical")].reset_index(drop=True)
print("After deduplication:", df.shape)


After deduplication: (168, 4)


In [32]:
# Output column name -> RDKit descriptor function, applied to each molecule.
descriptor_functions = {
    "MolWt": Descriptors.MolWt,
    "LogP": Descriptors.MolLogP,
    "TPSA": Descriptors.TPSA,
    "HBD": Descriptors.NumHDonors,
    "HBA": Descriptors.NumHAcceptors,
    "RB": Descriptors.NumRotatableBonds,
}

for descriptor_name, descriptor_fn in descriptor_functions.items():
    df[descriptor_name] = df["mol"].apply(descriptor_fn)

In [33]:
# Quick range check: print the smallest and largest value of each descriptor.
for col in ("MolWt", "LogP", "TPSA"):
    column_values = df[col]
    print(col, column_values.min(), column_values.max())


MolWt 132.16199999999995 660.0560000000002
LogP -1.3766000000000003 7.1868000000000025
TPSA 17.07 148.37


In [34]:
# Exported columns: canonical SMILES and label first, then the descriptors.
final_cols = ["smiles_canonical", "activity_label"]
final_cols += ["MolWt", "LogP", "TPSA", "HBD", "HBA", "RB"]

# Write the selected columns without the DataFrame index.
df.loc[:, final_cols].to_csv("qs_inhibitors_cleaned.csv", index=False)
print("Saved qs_inhibitors_cleaned.csv")

Saved qs_inhibitors_cleaned.csv


In [35]:
# Final sanity summary: row count, distinct canonical SMILES, label counts.
print(f"Final molecules: {df.shape[0]}")
print(f"Unique SMILES: {df['smiles_canonical'].nunique()}")
print("Class balance:", df["activity_label"].value_counts())

Final molecules: 168
Unique SMILES: 168
Class balance: activity_label
1    92
0    76
Name: count, dtype: int64
