In [12]:
import requests
import pandas as pd
import numpy as np
import time

# ChEMBL API endpoints
ACTIVITY_URL = "https://www.ebi.ac.uk/chembl/api/data/activity.json"
TARGET_URL = "https://www.ebi.ac.uk/chembl/api/data/target"
UNIPROT_URL = "https://www.uniprot.org/uniprot/{}.fasta"

# Parameters
NUM_PAIRS = 30000  # Target number of virus-drug interactions
BATCH_SIZE = 1000  # Fetch data in batches
RETRY_LIMIT = 3  # Number of retries for API requests


def request_with_retry(url):
    """Handles API requests with retries for SSL errors, timeouts, and connection errors."""
    for attempt in range(RETRY_LIMIT):
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                return response  # Return response if successful
        except (requests.exceptions.SSLError, 
                requests.exceptions.ConnectionError, 
                requests.exceptions.Timeout) as e:
            print(f"⚠️ Request error: {e}, retrying ({attempt+1}/{RETRY_LIMIT})...")
            time.sleep(2)  # Wait before retrying
    
    return None  # Return None if all retries fail


def get_virus_activity(offset=0, limit=BATCH_SIZE):
    """Fetches activity data and filters for virus targets with IC50 values."""
    url = f"{ACTIVITY_URL}?limit={limit}&offset={offset}"
    response = request_with_retry(url)
    
    if not response:
        return []  # Return empty list if request fails

    try:
        activities = response.json().get("activities", [])
    except ValueError:
        print("⚠️ Failed to parse JSON from response.")
        return []

    filtered_activities = []
    for act in activities:
        organism = act.get("target_organism", "")
        ic50_value = act.get("standard_value")
        standard_type = act.get("standard_type")

        if organism and "virus" in organism.lower() and standard_type == "IC50" and ic50_value:
            try:
                ic50_value = float(ic50_value)
                if ic50_value > 0:  # Ensure valid IC50 values
                    pIC50 = -np.log10(ic50_value * 1e-9)  # Convert nM to M, then -log10
                    act["pIC50"] = pIC50  # Store calculated pIC50
                    filtered_activities.append(act)
            except ValueError:
                continue  # Skip invalid values

    return filtered_activities


def get_protein_sequence(uniprot_id):
    """Fetches protein sequence from UniProt."""
    url = UNIPROT_URL.format(uniprot_id)
    response = request_with_retry(url)

    if not response:
        return None  # Return None if request fails

    try:
        lines = response.text.split("\n")
        return "".join(lines[1:])  # Ignore FASTA header
    except Exception:
        return None  # Handle parsing errors



def fetch_data(offset):
    """Fetches virus-drug interactions and stores them in a structured dataset."""
    data = []
    activities = get_virus_activity(offset, BATCH_SIZE)
    if not activities:
        return None  # Stop if no more data

    for act in activities:
        target_id = act.get("target_chembl_id")
        drug_smiles = act.get("canonical_smiles")
        organism_name = act.get("target_pref_name")
        pIC50 = act.get("pIC50")

        if target_id and drug_smiles and pIC50:
           
            # Get target protein sequence
            target_response = request_with_retry(f"{TARGET_URL}/{target_id}.json")
            
            if target_response:
                try:
                    target_data = target_response.json()
                    target_components = target_data.get("target_components", [])
                    
                    if target_components:  # Ensure non-empty list
                        uniprot_id = target_components[0].get("accession")
                        
                        if uniprot_id:
                            sequence = get_protein_sequence(uniprot_id)
                            if sequence:
                                data.append([organism_name, sequence, drug_smiles, pIC50])
                except ValueError:
                    print(f"⚠️ Failed to parse JSON for target {target_id}.")
    return data





In [13]:
all_data=[]
offset = 0
while(len(all_data)<NUM_PAIRS): 
    print(f"🔍 Fetching records from offset {offset} (Current size: {len(all_data)})")
    data = fetch_data(offset)
    if(data):
        all_data+=(data)
    offset += BATCH_SIZE  # Move to next batch
    time.sleep(1)  # Avoid hitting API rate limits

df = pd.DataFrame(all_data, columns=["Virus_Organism", "Protein_Sequence", "SMILES", "pIC50"])
# Save to CSV
df.to_csv(r"E:\anti\virus_drug_interactions.csv", index=False)
print("✅ Data saved to virus_drug_interactions.csv")

🔍 Fetching records from offset 0 (Current size: 0)
🔍 Fetching records from offset 1000 (Current size: 8)
🔍 Fetching records from offset 2000 (Current size: 18)
🔍 Fetching records from offset 3000 (Current size: 27)
🔍 Fetching records from offset 4000 (Current size: 39)
🔍 Fetching records from offset 5000 (Current size: 48)
🔍 Fetching records from offset 6000 (Current size: 57)
🔍 Fetching records from offset 7000 (Current size: 67)
🔍 Fetching records from offset 8000 (Current size: 81)
🔍 Fetching records from offset 9000 (Current size: 97)
🔍 Fetching records from offset 10000 (Current size: 106)
🔍 Fetching records from offset 11000 (Current size: 114)
🔍 Fetching records from offset 12000 (Current size: 124)
🔍 Fetching records from offset 13000 (Current size: 128)
🔍 Fetching records from offset 14000 (Current size: 139)
🔍 Fetching records from offset 15000 (Current size: 152)
🔍 Fetching records from offset 16000 (Current size: 157)
🔍 Fetching records from offset 17000 (Current size: 176)

KeyboardInterrupt: 

In [14]:
df = pd.DataFrame(all_data, columns=["Virus_Organism", "Protein_Sequence", "SMILES", "pIC50"])
# Save to CSV
df.to_csv(r"E:\anti\virus_drug_interactions.csv", index=False)
print("✅ Data saved to virus_drug_interactions.csv")

✅ Data saved to virus_drug_interactions.csv
