# COLLECTION OF DATA

In [None]:
! pip install chembl_webresource_client

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.9-py3-none-any.whl.metadata (1.4 kB)
Collecting requests-cache~=1.2 (from chembl_webresource_client)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting cattrs>=22.2 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading cattrs-25.2.0-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading url_normalize-2.2.1-py3-none-any.whl.metadata (5.6 kB)
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cattrs-25.2.0-py3-none-any.whl (70 kB)
[2K   [90m━━━━━━━━━━━━━━

In [3]:
import pandas as pd
import time
from chembl_webresource_client.new_client import new_client

# ChEMBL resource handles: `activity` is queried later for bioactivity
# records, `target` is used for the target search below.
activity, target = new_client.activity, new_client.target

# Step 1: run the "Small GTPase" search on the target resource and load
# the returned entries into a DataFrame (one row per returned target).
target_query = target.search("Small GTPase")
targets = pd.DataFrame.from_dict(target_query)

In [4]:
# Step 2: keep human, single-protein targets that have both an ID and a
# preferred name, with one row per target ChEMBL ID.
# NOTE(review): the saved output of this cell also lists entries such as
# Caspase-7 and C-C motif chemokine 22, so the search results are presumably
# not restricted to small GTPases -- confirm whether an extra filter is needed.
is_human = targets["organism"] == "Homo sapiens"
is_single_protein = targets["target_type"] == "SINGLE PROTEIN"

human_gtpases = (
    targets.loc[is_human & is_single_protein]
    .dropna(subset=["target_chembl_id", "pref_name"])
    .drop_duplicates(subset=["target_chembl_id"])
)

print(f"✅ Found {len(human_gtpases)} human small GTPase targets")
print(human_gtpases[["pref_name", "target_chembl_id"]])

✅ Found 69 human small GTPase targets
                                             pref_name target_chembl_id
0                            GTP-binding protein SAR1a    CHEMBL4295960
3          Dynamin-like 120 kDa protein, mitochondrial    CHEMBL4105705
4    ADP-ribosylation factor GTPase-activating prot...    CHEMBL5465332
5                     Transforming protein p21/H-Ras-1       CHEMBL2167
6                                          GTPase NRas    CHEMBL2079845
..                                                 ...              ...
124                                          Caspase-7       CHEMBL3468
125                                         Caspase-10       CHEMBL5037
127                                         Caspase-14       CHEMBL5991
133                             C-C motif chemokine 22    CHEMBL4295649
134  Aminoacyl tRNA synthase complex-interacting mu...    CHEMBL4295810

[69 rows x 2 columns]


In [5]:
# Step 3: for every target in `human_gtpases`, fetch its IC50 activity records
# reported in nM, clean them, and collect one DataFrame per target in
# `bioactivity_list` (targets with no usable rows are skipped).
REQUEST_PAUSE_SECONDS = 0.5  # pause after each target's request

bioactivity_list = []

for _, row in human_gtpases.iterrows():
    target_name = row["pref_name"]
    target_id = row["target_chembl_id"]

    print(f"\nFetching {target_name} ({target_id})...")

    try:
        # Strict filters
        records = activity.filter(
            target_chembl_id=target_id,
            standard_type="IC50",
            standard_units="nM"
        )

        df = pd.DataFrame.from_dict(records)

        if df.empty:
            print(f"⚠️ No IC50 records for {target_name}")
            continue

        # Remove NaNs
        df = df.dropna(subset=["standard_value", "canonical_smiles"])

        # Keep only positive IC50 values. Values that cannot be parsed as
        # numbers become NaN and are dropped row-by-row, instead of raising
        # and discarding every record of this target. The stored
        # `standard_value` column itself is left untouched.
        ic50_values = pd.to_numeric(df["standard_value"], errors="coerce")
        # Explicit copy so the column assignments below write to an
        # independent frame rather than to a slice of the filtered one.
        df = df[ic50_values > 0].copy()

        if df.empty:
            print(f"⚠️ No usable records after filtering for {target_name}")
            continue

        # Add metadata
        df["target_name"] = target_name
        df["target_chembl_id"] = target_id

        bioactivity_list.append(df)
        print(f"✅ Added {len(df)} filtered records for {target_name}")

    except Exception as e:
        # Best effort: report the failure and move on to the next target.
        print(f"❌ Error fetching {target_name}: {str(e)}")

    finally:
        # Runs on every iteration -- including the `continue` paths above and
        # failed requests -- so the pause applies to each request, not only to
        # targets that produced data.
        time.sleep(REQUEST_PAUSE_SECONDS)  # avoid API rate limits



Fetching GTP-binding protein SAR1a (CHEMBL4295960)...
⚠️ No IC50 records for GTP-binding protein SAR1a

Fetching Dynamin-like 120 kDa protein, mitochondrial (CHEMBL4105705)...
⚠️ No IC50 records for Dynamin-like 120 kDa protein, mitochondrial

Fetching ADP-ribosylation factor GTPase-activating protein 1 (CHEMBL5465332)...
⚠️ No IC50 records for ADP-ribosylation factor GTPase-activating protein 1

Fetching Transforming protein p21/H-Ras-1 (CHEMBL2167)...
✅ Added 62 filtered records for Transforming protein p21/H-Ras-1

Fetching GTPase NRas (CHEMBL2079845)...
✅ Added 1 filtered records for GTPase NRas

Fetching GTPase KRas (CHEMBL2189121)...
✅ Added 1052 filtered records for GTPase KRas

Fetching Ras GTPase-activating-like protein IQGAP1 (CHEMBL4295763)...
⚠️ No IC50 records for Ras GTPase-activating-like protein IQGAP1

Fetching Rho-GTPase-activating protein 3 (CHEMBL4504)...
⚠️ No IC50 records for Rho-GTPase-activating protein 3

Fetching Arf-GAP with SH3 domain, ANK repeat and PH dom

In [6]:
# Step 4: merge the per-target frames into one table, write it to CSV and
# print a short preview; report when nothing was collected.
# NOTE(review): the saved output shows a warning fragment pointing at the
# concat line -- presumably pandas' FutureWarning about empty/all-NA entries;
# confirm before upgrading pandas.
if not bioactivity_list:
    print("\n❌ No data collected after applying filters")
else:
    specified_df = pd.concat(bioactivity_list, ignore_index=True)
    print(f"\n🎯 Total records collected after strict filtering: {specified_df.shape[0]}")

    # Save to CSV
    specified_df.to_csv("RAW_Specified_Small_GTPases_Bioactivity_Filtered.csv", index=False)
    print("💾 Saved as RAW_Specified_Small_GTPases_Bioactivity_Filtered.csv")

    # Preview: first rows of the identifying and activity columns
    preview_columns = [
        "molecule_chembl_id", "canonical_smiles",
        "standard_value", "standard_units",
        "target_name", "target_chembl_id",
    ]
    print(specified_df[preview_columns].head())

  specified_df = pd.concat(bioactivity_list, ignore_index=True)



🎯 Total records collected after strict filtering: 11059
💾 Saved as RAW_Specified_Small_GTPases_Bioactivity_Filtered.csv
  molecule_chembl_id                                   canonical_smiles  \
0        CHEMBL11252  Cn1cc(NC(=O)c2cc(NC(=O)c3cc(NC=O)cn3C)cn2C)cc1...   
1       CHEMBL552863  Cl.Cn1cc(NC(=O)c2cc(NC(=O)c3cc(NC(=O)c4cc5cc(N...   
2       CHEMBL554247  Cl.Cn1cc(NC(=O)c2cc(NC(=O)c3cc(NC(=O)c4cc5cc(N...   
3       CHEMBL536328  Cl.Cn1cc(NC(=O)c2cc(NC(=O)c3cc(NC(=O)c4cc5cc(N...   
4       CHEMBL552690  Cl.Cn1cc(NC(=O)c2cc(NC(=O)c3cc(NC(=O)c4nc5cc(N...   

  standard_value standard_units                       target_name  \
0          200.0             nM  Transforming protein p21/H-Ras-1   
1           20.0             nM  Transforming protein p21/H-Ras-1   
2           15.0             nM  Transforming protein p21/H-Ras-1   
3           20.0             nM  Transforming protein p21/H-Ras-1   
4           15.0             nM  Transforming protein p21/H-Ras-1   

  target_chem