In [3]:
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator

In [4]:
# Load the cleaned dataset; every later cell reads from `df`.
df = pd.read_csv("qs_inhibitors_cleaned.csv")

# Fail immediately if either column used downstream is absent.
for required_column in ("smiles_canonical", "activity_label"):
    assert required_column in df.columns

print("Dataset shape:", df.shape)
df.head()

Dataset shape: (168, 8)


Unnamed: 0,smiles_canonical,activity_label,MolWt,LogP,TPSA,HBD,HBA,RB
0,CCCCCCCCCC(=O)NC1CCCC1,0,239.403,4.1859,29.1,1,1,9
1,O=C1C=C(Br)/C(=C/Br)O1,1,253.877,2.0583,26.3,0,2,0
2,O=C/C=C/c1ccccc1,0,132.162,1.8987,17.07,0,1,2
3,CCCCCCCCCNC(=O)CC(=O)c1ccccc1,1,289.419,4.1262,46.17,1,2,11
4,CCCCCCCCCCCCn1nnc(CC(=O)O)n1,1,296.415,3.2211,80.9,1,5,13


In [5]:
def smiles_to_mol(smiles):
    """Parse a SMILES string into an RDKit molecule, failing loudly on bad input.

    Parameters
    ----------
    smiles : str
        SMILES string to parse.

    Returns
    -------
    rdkit.Chem.Mol
        The parsed molecule.

    Raises
    ------
    ValueError
        If ``smiles`` is not a string, is empty/blank, or cannot be parsed
        by RDKit.
    """
    # Guard before calling RDKit: a missing CSV value reaches this function as
    # NaN/None (not a str), and an empty SMILES parses to a molecule with no
    # atoms instead of None, so neither case is caught by the check below.
    if not isinstance(smiles, str) or not smiles.strip():
        raise ValueError(f"Invalid SMILES found: {smiles!r}")

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Invalid SMILES found: {smiles}")
    return mol

# Parse every canonical SMILES once and keep the molecule next to its row;
# smiles_to_mol raises on the first entry it cannot parse.
smiles_column = df["smiles_canonical"]
df["mol"] = smiles_column.map(smiles_to_mol)

In [6]:
# Morgan fingerprint settings; both are passed to mol_to_morgan_fp when the
# feature matrix X is built.
MORGAN_RADIUS = 2  # `radius` argument of the fingerprint call
MORGAN_BITS = 2048  # fingerprint length, i.e. number of columns in X

In [8]:
def mol_to_morgan_fp(mol, radius=2, n_bits=2048):
    """Return the Morgan bit fingerprint of ``mol`` as a 1-D NumPy array.

    Built with the ``rdFingerprintGenerator`` API instead of
    ``AllChem.GetMorganFingerprintAsBitVect``, which recent RDKit releases
    mark as deprecated in favour of the generator interface.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Molecule to fingerprint.
    radius : int, default 2
        Morgan radius passed to the generator.
    n_bits : int, default 2048
        Length of the folded bit vector.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_bits,)`` and dtype ``int8`` filled from the
        fingerprint bit vector.
    """
    generator = rdFingerprintGenerator.GetMorganGenerator(
        radius=radius,
        fpSize=n_bits,
    )
    fp = generator.GetFingerprint(mol)

    # ConvertToNumpyArray fills a pre-allocated array in place; keeping the
    # int8 buffer preserves the dtype of the arrays saved downstream.
    arr = np.zeros((n_bits,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

In [9]:
# One fingerprint row per molecule, in the same order as the rows of `df`.
fingerprint_rows = [
    mol_to_morgan_fp(molecule, MORGAN_RADIUS, MORGAN_BITS)
    for molecule in df["mol"]
]
X = np.array(fingerprint_rows)

# Labels taken from the same frame, so X[i] and y[i] describe the same row.
y = df["activity_label"].values



In [10]:
# Quick sanity checks on the feature matrix and labels before saving them.
unique_labels = np.unique(y)
bit_density = X.mean()  # mean over every entry of X

print("X shape:", X.shape)
print("y shape:", y.shape)

print("Unique labels:", unique_labels)
print("Fingerprint bit density:", bit_density)

X shape: (168, 2048)
y shape: (168,)
Unique labels: [0 1]
Fingerprint bit density: 0.02120245070684524


In [11]:
# Write the features and labels to .npy files in the working directory.
for output_path, array in (("X_morgan.npy", X), ("y_labels.npy", y)):
    np.save(output_path, array)

print("Saved X_morgan.npy and y_labels.npy")

Saved X_morgan.npy and y_labels.npy
