# Preprocess small-GTPase bioactivity data

In [2]:
! pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.3.5-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.3.5-cp312-cp312-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.2/36.2 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.3.5


In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem

# Read the raw bioactivity export into a DataFrame and report (rows, columns).
data = pd.read_csv(
    filepath_or_buffer="RAW_Specified_Small_GTPases_Bioactivity_Filtered.csv"
)
print("Raw filtered dataset:", data.shape)


Raw filtered dataset: (11059, 47)


In [3]:
def compute_pIC50(value):
    """Convert a concentration given in nanomolar to pIC50.

    pIC50 is ``-log10`` of the molar concentration, e.g. 1000 nM -> 6.0.

    Parameters
    ----------
    value : float, int or str
        Concentration in nM; anything ``float()`` can parse.

    Returns
    -------
    float
        The pIC50, or ``nan`` when ``value`` is missing, cannot be parsed,
        is not finite, or is not strictly positive. Returning ``nan`` (rather
        than raising, or yielding ``inf`` with a NumPy warning for zero) lets
        the caller discard bad measurements with a plain ``dropna``.
    """
    try:
        nanomolar = float(value)
    except (TypeError, ValueError):
        # None, non-numeric strings, and other unparseable entries.
        return np.nan

    # The chained comparison is False for NaN, zero, negatives and +inf alike,
    # i.e. for every input whose logarithm is undefined or infinite.
    if not (0 < nanomolar < float("inf")):
        return np.nan

    molar = nanomolar * 1e-9  # nM -> M
    return -np.log10(molar)

# Derive pIC50 from every standard_value, then discard the rows for which the
# conversion produced a missing value.
pic50_column = data["standard_value"].apply(compute_pIC50)
data["pIC50"] = pic50_column
data = data.dropna(subset=["pIC50"])
print("After pIC50 conversion:", data.shape)

After pIC50 conversion: (11059, 48)


In [4]:
# Keep only measurements whose pIC50 lies in the closed interval [3, 10];
# Series.between is inclusive on both ends by default.
within_range = data["pIC50"].between(3, 10)
data = data[within_range]
print("After removing extreme values:", data.shape)

After removing extreme values: (10971, 48)


In [5]:
# Collapse to one row per molecule_chembl_id: keep the first SMILES in each
# group and the median pIC50. Every other column is dropped by the aggregation.
data = (
    data.groupby("molecule_chembl_id")
    .agg(
        canonical_smiles=("canonical_smiles", "first"),
        pIC50=("pIC50", "median"),
    )
    .reset_index()
)
print("After duplicate removal:", data.shape)

After duplicate removal: (7686, 3)


In [6]:
# Flag the rows whose SMILES RDKit can parse. Non-string entries (e.g. NaN for
# a missing structure) are rejected up front: handing them to
# Chem.MolFromSmiles raises an argument-type error instead of returning None,
# which would abort the cell rather than drop the row.
smiles_ok = data["canonical_smiles"].apply(
    lambda smi: isinstance(smi, str) and Chem.MolFromSmiles(smi) is not None
)
# Filter with a boolean mask instead of storing Mol objects in a temporary
# column. astype(bool) keeps this a boolean selection even when the frame is
# empty (an empty object-dtype Series would otherwise be read as column labels).
data = data[smiles_ok.astype(bool)]
print("After removing invalid SMILES:", data.shape)

After removing invalid SMILES: (7686, 3)


In [7]:
# ---------- Step 6: Label datasets ----------
def label_binary(p, active_threshold=6.0):
    """Assign a two-class bioactivity label to a pIC50 value.

    Parameters
    ----------
    p : float
        pIC50 of the compound.
    active_threshold : float, optional
        Minimum pIC50 (inclusive) for a compound to count as active.
        Defaults to 6, the cut-off used throughout this notebook.

    Returns
    -------
    str
        ``"active"`` when ``p >= active_threshold``, otherwise ``"inactive"``.

    Notes
    -----
    NaN compares False against the threshold and is therefore labelled
    ``"inactive"``; drop missing values before labelling.
    """
    if p >= active_threshold:
        return "active"
    return "inactive"

def label_three_class(p, active_threshold=6.0, inactive_threshold=5.0):
    """Assign a three-class bioactivity label to a pIC50 value.

    Parameters
    ----------
    p : float
        pIC50 of the compound.
    active_threshold : float, optional
        Minimum pIC50 (inclusive) for ``"active"``. Defaults to 6.
    inactive_threshold : float, optional
        Maximum pIC50 (inclusive) for ``"inactive"``. Defaults to 5.

    Returns
    -------
    str
        ``"active"`` when ``p >= active_threshold``, ``"inactive"`` when
        ``p <= inactive_threshold``, and ``"intermediate"`` for values
        strictly between the two thresholds.

    Raises
    ------
    ValueError
        If ``inactive_threshold`` is greater than ``active_threshold``, since
        the classes would then overlap.

    Notes
    -----
    NaN compares False against both thresholds and therefore falls through to
    ``"intermediate"``; drop missing values before labelling.
    """
    if inactive_threshold > active_threshold:
        raise ValueError(
            "inactive_threshold must not exceed active_threshold "
            f"(got {inactive_threshold} > {active_threshold})"
        )

    # The active check comes first so a value equal to both thresholds is active.
    if p >= active_threshold:
        return "active"
    if p <= inactive_threshold:
        return "inactive"
    return "intermediate"

# Build two independent copies of the cleaned table, each with its own
# "bioactivity_class" column derived from pIC50.
pic50_values = data["pIC50"]

binary = data.copy()
binary["bioactivity_class"] = pic50_values.apply(label_binary)

threeclass = data.copy()
threeclass["bioactivity_class"] = pic50_values.apply(label_three_class)

In [8]:
# Write both labelled datasets to CSV in the working directory, without the
# DataFrame index column.
binary.to_csv(path_or_buf="Binary_Preprocessed.csv", index=False)
threeclass.to_csv(path_or_buf="3Class_Preprocessed.csv", index=False)


In [9]:
# ---------- Final Stats ----------
# Report each labelled dataset's shape followed by its per-class row counts.
binary_counts = binary["bioactivity_class"].value_counts()
threeclass_counts = threeclass["bioactivity_class"].value_counts()

print("\n✅ Preprocessing complete!")
print("Binary dataset:", binary.shape)
print(binary_counts)

print("\n3-Class dataset:", threeclass.shape)
print(threeclass_counts)



✅ Preprocessing complete!
Binary dataset: (7686, 4)
bioactivity_class
inactive    6296
active      1390
Name: count, dtype: int64

3-Class dataset: (7686, 4)
bioactivity_class
inactive        4666
intermediate    1630
active          1390
Name: count, dtype: int64
