In [1]:
import sys
from pathlib import Path

project_root = Path().resolve().parents[1]
sys.path.append(str(project_root))

In [4]:
import requests
from time import sleep

import pandas as pd
from tqdm import tqdm

from src.constants import HACKATHON_DATA, ALPHAFOLD_STRUCTURES_DIR, PDB_STRUCTURES_DIR

### 1. Find unique PDB and AlphaFoldDB IDs

In [5]:
data = pd.read_csv(HACKATHON_DATA)
data.head()

Unnamed: 0,pdb_id,query_poi,chain,uniprot_id,pdb_pocket_selection,alphafold_pocket_selection
0,3kr4,K_BES_1003,K,Q8IL11,3kr4 and ( (chain K and resi 374+379+386+392+3...,AF-Q8IL11-F1-model_v4 and ( resi 374+379+386+3...
1,3kun,B_HEM_139,B,Q9NAV8,3kun and ( (chain B and resi 24+31+34+35+36+51...,AF-Q9NAV8-F1-model_v4 and ( resi 25+32+35+36+3...
2,5t67,A_SAH_502,A,B3TMQ9,5t67 and ( (chain A and resi 72+76+78+80+90+11...,AF-B3TMQ9-F1-model_v4 and ( resi 72+76+78+80+9...
3,3kr6,A_FFQ_500,A,P0A749,3kr6 and ( (chain A and resi 22+23+49+91+114+1...,AF-P0A749-F1-model_v4 and ( resi 22+23+49+91+1...
4,3kpz,A_ZNE_525,A,P11473,3kpz and ( (chain A and resi 143+147+150+227+2...,AF-P11473-F1-model_v4 and ( resi 143+147+150+2...


In [6]:
unique_pdb_ids = data['pdb_id'].unique()
print(f"Number of unique PDB IDs: {len(unique_pdb_ids)}")
print(f"Unique PDB IDs: {unique_pdb_ids}")

Number of unique PDB IDs: 24988
Unique PDB IDs: ['3kr4' '3kun' '5t67' ... '1w7c' '1w34' '1w77']


In [7]:
unique_uniprot_ids = data['uniprot_id'].unique()
print(f"Number of unique UniProt IDs: {len(unique_uniprot_ids)}")
print(f"Unique UniProt IDs: {unique_uniprot_ids}")

Number of unique UniProt IDs: 8130
Unique UniProt IDs: ['Q8IL11' 'Q9NAV8' 'B3TMQ9' ... 'Q96AZ6' 'Q96X16' 'P69834']


### 2. Download apo structure predictions from AlphaFoldDB

In [11]:
def get_alphafold_url(uniprot_id: str) -> str:
    return f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb"

def get_pdb_url(pdb_id: str) -> str:
    return f"https://files.rcsb.org/download/{pdb_id}.pdb"

In [None]:
def download_structure(
    structure_id: str,
    database: str,
    out_dir: Path,
    format: str = "pdb",
    overwrite: bool = False
) -> str:
    """
    Download a structure file from the given URL and save it to the specified directory.

    Args:
        structure_id (str): The ID of the structure to download.
        out_dir (Path): The directory to save the downloaded file.
        format (str): The format of the structure file (e.g., "pdb").
        overwrite (bool): Whether to overwrite the file if it already exists.

    Returns:
        str: File download status ("downloaded", "not found", or "error").
    """
    assert database in ["alphafold", "pdb"], "Database must be either 'alphafold' or 'pdb'."
    assert format in ["pdb", "cif"], "Format must be either 'pdb' or 'cif'."

    url = get_alphafold_url(structure_id) if database == "alphafold" else get_pdb_url(structure_id)
    out_file = out_dir / f"{structure_id}.{format}"

    if out_file.exists() and not overwrite:
        print(f"File {out_file} already exists. Skipping download.")
        return

    r = requests.get(url)
    if r.status_code == 200:
        out_file.write_bytes(r.content)
        return "downloaded"
    elif r.status_code == 404:
        return "not found"
    else:
        return f"error ({r.status_code})"

In [18]:
def download_all_structures(
    structure_ids: list[str],
    database: str,
    out_dir: Path,
    format: str = "pdb",
    overwrite: bool = False,
    delay: float = 0.5
) -> None:
    """
    Download all structures from the given list of structure IDs.

    Args:
        structure_ids (list): List of structure IDs to download.
        out_dir (Path): The directory to save the downloaded files.
        format (str): The format of the structure files (e.g., "pdb").
        overwrite (bool): Whether to overwrite the files if they already exist.
        delay (float): Delay in seconds between downloads to avoid overwhelming the server.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    results = {}
    for structure_id in tqdm(structure_ids):
        result = download_structure(structure_id, database, out_dir, format, overwrite)
        results[structure_id] = result
        sleep(delay)

    return results

In [21]:
unique_pdb_ids[:3]

array(['3kr4', '3kun', '5t67'], dtype=object)

In [22]:
pdb_ids = unique_pdb_ids[:3]
results = download_all_structures(
    pdb_ids,
    database="pdb",
    out_dir=PDB_STRUCTURES_DIR,
    format="pdb",
    overwrite=False,
    delay=0.5
)
print("PDB download results:")
for pdb_id, result in results.items():
    print(f"{pdb_id}: {result}")

  0%|          | 0/3 [00:02<?, ?it/s]


NameError: name 'out_path' is not defined

In [21]:
def download_alphafold_structure(uniprot_id: str, out_dir: Path, format: str = "pdb", overwrite=False):
    """
    Downloads AlphaFold predicted structure for a given UniProt ID.

    Args:
        uniprot_id (str): UniProt accession ID (e.g., 'P12345')
        out_dir (Path): Directory to save the file
        format (str): 'pdb' or 'cif'
        overwrite (bool): Whether to overwrite if file exists
    """
    assert format in {"pdb", "cif"}
    url = f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.{format}"
    out_path = out_dir / f"{uniprot_id}.{format}"

    if out_path.exists() and not overwrite:
        return "exists"

    r = requests.get(url)
    if r.status_code == 200:
        out_path.write_bytes(r.content)
        return "downloaded"
    elif r.status_code == 404:
        return "not found"
    else:
        return f"error ({r.status_code})"

def download_all_alphafold(uniprot_ids, out_dir, format="pdb", delay=0.5):
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    results = {}
    for uid in tqdm(uniprot_ids):
        result = download_alphafold_structure(uid, out_dir, format=format)
        results[uid] = result
        sleep(delay)  # polite pause between requests

    return results

In [20]:
uniprot_ids = ["P12345", "Q9H9K5", "O60260"]  # Replace with your actual list
results = download_all_alphafold(uniprot_ids, out_dir=ALPHAFOLD_STRUCTURES_DIR, format="pdb")
print(results)

100%|██████████| 3/3 [00:02<00:00,  1.49it/s]

{'P12345': 'downloaded', 'Q9H9K5': 'downloaded', 'O60260': 'downloaded'}





### 3. Download holo structures from PDB

In [None]:
def download_pdb_structure(pdb_id: str, out_dir: Path, format: str = "pdb", overwrite=False):
    """
    Downloads PDB structure by PDB ID.

    Args:
        pdb_id (str): e.g., '1TUP'
        out_dir (Path): directory to save file
        format (str): 'pdb' or 'cif'
        overwrite (bool): overwrite if file exists
    """
    assert format in {"pdb", "cif"}
    url = f"https://files.rcsb.org/download/{pdb_id.upper()}.{format}"
    out_path = out_dir / f"{pdb_id.upper()}.{format}"

    if out_path.exists() and not overwrite:
        return "exists"

    r = requests.get(url)
    if r.status_code == 200:
        out_path.write_bytes(r.content)
        return "downloaded"
    elif r.status_code == 404:
        return "not found"
    else:
        return f"error ({r.status_code})"

def download_all_pdb(pdb_ids, out_dir, format="pdb", delay=0.5):
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    results = {}
    for pdb_id in tqdm(pdb_ids):
        result = download_pdb_structure(pdb_id, out_dir, format=format)
        results[pdb_id] = result
        sleep(delay)

    return results

In [10]:
pdb_ids = ["1TUP", "4HHB", "2BEG"]  # Replace with your actual list
results = download_all_pdb(pdb_ids, )
print(results)

TypeError: download_all_pdb() missing 1 required positional argument: 'out_dir'