# Receptor: Create df


In [1]:
%pip install requests
import requests
import pandas as pd

# Define the two UniProt accession codes
uniprot_accessions = ["P09601", "A0A7I2V3I1"]

# Dictionary holding additional info for each accession (adjust as needed)
info_dict = {
    "P09601": {
        "Gene Symbol": "HMOX1",
        "Ensembl Gene ID": "ENSG00000100292",
        "Pfam IDs": "Pf01126"
    },
    "A0A7I2V3I1": {
        "Gene Symbol": "Gene2",         # Replace with correct gene symbol
        "Ensembl Gene ID": "ENSG000002XXXX",  # Replace with correct Ensembl ID
        "Pfam IDs": "PfXXXXX"           # Replace with correct Pfam ID(s)
    }
}

# Prepare the DataFrame with the required columns
columns = ["Gene Symbol", "Ensembl Gene ID", "Accession", "Pfam IDs",
           "Identifier", "Method", "Resolution", "Chain", "Positions"]
df = pd.DataFrame(columns=columns)

# Loop over each accession
for uniprot_accession in uniprot_accessions:
    # Build the JSON query to get PDB IDs for experimental structures
    query = {
        "query": {
            "type": "group",
            "logical_operator": "and",
            "nodes": [
                {
                    "type": "terminal",
                    "service": "text",
                    "parameters": {
                        "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
                        "operator": "exact_match",
                        "value": uniprot_accession
                    }
                },
                {
                    "type": "terminal",
                    "service": "text",
                    "parameters": {
                        "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_name",
                        "operator": "exact_match",
                        "value": "UniProt"
                    }
                }
            ]
        },
        "return_type": "entry",
        "request_options": {"return_all_hits": True}
    }
    
    # URL for the RCSB search API
    rcsb_url = "https://search.rcsb.org/rcsbsearch/v2/query"
    
    # Send the POST request
    response = requests.post(rcsb_url, json=query)
    
    if response.status_code == 200:
        results = response.json()
        pdb_ids = [entry["identifier"] for entry in results.get("result_set", [])]
    
        # Process experimental structures for this accession
        for pdb_id in pdb_ids:
            pdb_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
            pdb_response = requests.get(pdb_url)
    
            if pdb_response.status_code == 200:
                pdb_data = pdb_response.json()
                method = pdb_data.get("rcsb_entry_info", {}).get("experimental_method", "N/A")
                # Extract resolution using the "resolution_combined" field
                resolution = pdb_data.get("rcsb_entry_info", {}).get("resolution_combined", ["N/A"])[0]
                if resolution != "N/A":
                    resolution = f"{resolution} Å"
    
                polymer_entities = pdb_data.get("rcsb_entry_container_identifiers", {}).get("polymer_entity_ids", [])
    
                for entity_id in polymer_entities:
                    entity_url = f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{entity_id}"
                    entity_response = requests.get(entity_url)
    
                    if entity_response.status_code == 200:
                        entity_data = entity_response.json()
                        # Concatenate chain IDs with '/'
                        chain = "/".join(entity_data.get("rcsb_polymer_entity_container_identifiers", {}).get("auth_asym_ids", []))
                        # Use the length of the canonical sequence (if available) as a proxy for Positions
                        seq = entity_data.get("entity_poly", {}).get("pdbx_seq_one_letter_code_can", "N/A")
                        positions_range = f"1-{len(seq)}" if seq != "N/A" else "N/A"
    
                        new_row = pd.DataFrame({
                            "Gene Symbol": [info_dict[uniprot_accession]["Gene Symbol"]],
                            "Ensembl Gene ID": [info_dict[uniprot_accession]["Ensembl Gene ID"]],
                            "Accession": [uniprot_accession],
                            "Pfam IDs": [info_dict[uniprot_accession]["Pfam IDs"]],
                            "Identifier": [pdb_id],
                            "Method": [method],
                            "Resolution": [resolution],
                            "Chain": [chain],
                            "Positions": [positions_range]
                        })
                        df = pd.concat([df, new_row], ignore_index=True)
    else:
        print(f"Error {response.status_code}: {response.text}")
    
    # Now add the AlphaFold predicted structure row for this accession
    alphafold_id = f"AF-{uniprot_accession}-F1"
    alphafold_url = f"https://alphafold.ebi.ac.uk/files/{alphafold_id}.pdb"
    
    # Default details for AlphaFold; adjust if you want to parse the file
    alphafold_method = "Predicted"
    alphafold_resolution = ""
    alphafold_chain = ""
    alphafold_positions = ""
    
    # Optionally check if the AlphaFold file is available
    af_response = requests.get(alphafold_url)
    if af_response.status_code == 200:
        # You can add parsing logic here if needed
        pass
    
    new_row_af = pd.DataFrame({
        "Gene Symbol": [info_dict[uniprot_accession]["Gene Symbol"]],
        "Ensembl Gene ID": [info_dict[uniprot_accession]["Ensembl Gene ID"]],
        "Accession": [uniprot_accession],
        "Pfam IDs": [info_dict[uniprot_accession]["Pfam IDs"]],
        "Identifier": [alphafold_id],
        "Method": [alphafold_method],
        "Resolution": [alphafold_resolution],
        "Chain": [alphafold_chain],
        "Positions": [alphafold_positions]
    })
    df = pd.concat([df, new_row_af], ignore_index=True)

# Display the final DataFrame
print(df)

# Save the DataFrame to CSV
output_path = "/Users/nusin/Library/Mobile Documents/com~apple~CloudDocs/Desktop/IOV/3_Projects/PPI/1_Input_Proteins_pdbs/df_receptor_hmox1.csv"
df.to_csv(output_path, index=False)


Note: you may need to restart the kernel to use updated packages.
Error 204: 
   Gene Symbol  Ensembl Gene ID   Accession Pfam IDs        Identifier  \
0        HMOX1  ENSG00000100292      P09601  Pf01126              1N3U   
1        HMOX1  ENSG00000100292      P09601  Pf01126              1N45   
2        HMOX1  ENSG00000100292      P09601  Pf01126              1NI6   
3        HMOX1  ENSG00000100292      P09601  Pf01126              1OYK   
4        HMOX1  ENSG00000100292      P09601  Pf01126              1OYL   
5        HMOX1  ENSG00000100292      P09601  Pf01126              1OZE   
6        HMOX1  ENSG00000100292      P09601  Pf01126              1OZL   
7        HMOX1  ENSG00000100292      P09601  Pf01126              1OZR   
8        HMOX1  ENSG00000100292      P09601  Pf01126              1OZW   
9        HMOX1  ENSG00000100292      P09601  Pf01126              1S13   
10       HMOX1  ENSG00000100292      P09601  Pf01126              1S8C   
11       HMOX1  ENSG00000100292   

# Receptor: Download Structures


In [None]:
%pip install requests

import os
import requests
import pandas as pd

# ---------------------------
# Downloading Experimental (RCSB) PDB Files
# ---------------------------

# Directory to save the experimental RCSB PDB files
rcsb_output_dir = '/Users/nusin/Library/Mobile Documents/com~apple~CloudDocs/Desktop/IOV/3_Projects/PPI/1_Input_Proteins_pdbs/receptor_pdbs/rcsb'
os.makedirs(rcsb_output_dir, exist_ok=True)

# Base URL for RCSB PDB files
rcsb_download_base_url = "https://files.rcsb.org/download/"

# Function to download experimental PDB files from RCSB
def download_rcsb_pdb(identifier, output_directory):
    pdb_url = f"{rcsb_download_base_url}{identifier}.pdb"
    try:
        response = requests.get(pdb_url, timeout=10)
        response.raise_for_status()

        pdb_file_path = os.path.join(output_directory, f"{identifier}.pdb")
        with open(pdb_file_path, 'wb') as pdb_file:
            pdb_file.write(response.content)

        print(f"Downloaded (RCSB): {identifier}.pdb")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {identifier} from RCSB: {e}")

# Load DataFrame (note: using the CSV name from your first chunk)
file_path = '/Users/nusin/Library/Mobile Documents/com~apple~CloudDocs/Desktop/IOV/3_Projects/PPI/1_Input_Proteins_pdbs/df_receptor_hmox1.csv'
df_receptor_hmox1 = pd.read_csv(file_path)

# Create list of PDB IDs (excluding AlphaFold entries)
pdb_ids = df_receptor_hmox1['Identifier'][~df_receptor_hmox1['Identifier'].str.startswith("AF-")].tolist()

# Download experimental RCSB PDB files
for pdb_id in pdb_ids:
    download_rcsb_pdb(pdb_id, rcsb_output_dir)

# ---------------------------
# Downloading AlphaFold PDB Files
# ---------------------------

# Directory for AlphaFold PDB files
alphafold_output_dir = '/Users/nusin/Library/Mobile Documents/com~apple~CloudDocs/Desktop/IOV/3_Projects/PPI/1_Input_Proteins_pdbs/receptor_pdbs/alphafold'
os.makedirs(alphafold_output_dir, exist_ok=True)

# Base URL for AlphaFold files
alphafold_download_base_url = "https://alphafold.ebi.ac.uk/files/"

# Function to download AlphaFold PDB files
def download_alphafold_pdb(identifier, output_directory):
    # Append '-model_v4' so that the URL becomes, for example:
    # https://alphafold.ebi.ac.uk/files/AF-P09601-F1-model_v4.pdb
    pdb_url = f"{alphafold_download_base_url}{identifier}-model_v4.pdb"
    try:
        response = requests.get(pdb_url, timeout=10)
        response.raise_for_status()

        pdb_file_path = os.path.join(output_directory, f"{identifier}.pdb")
        with open(pdb_file_path, 'wb') as pdb_file:
            pdb_file.write(response.content)

        print(f"Downloaded (AlphaFold): {identifier}.pdb")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {identifier} from AlphaFold: {e}")

# Download AlphaFold PDB files (only for entries starting with "AF-")
for identifier in df_receptor_hmox1['Identifier']:
    if identifier.startswith("AF-"):
        download_alphafold_pdb(identifier, alphafold_output_dir)


Downloaded (RCSB): 1N3U.pdb
Downloaded (RCSB): 1N45.pdb
Downloaded (RCSB): 1NI6.pdb
Downloaded (RCSB): 1OYK.pdb
Downloaded (RCSB): 1OYL.pdb
Downloaded (RCSB): 1OZE.pdb
Downloaded (RCSB): 1OZL.pdb
Downloaded (RCSB): 1OZR.pdb
Downloaded (RCSB): 1OZW.pdb
Downloaded (RCSB): 1S13.pdb
Downloaded (RCSB): 1S8C.pdb
Downloaded (RCSB): 1T5P.pdb
Downloaded (RCSB): 1TWN.pdb
Downloaded (RCSB): 1TWR.pdb
Downloaded (RCSB): 1XJZ.pdb
Downloaded (RCSB): 1XK0.pdb
Downloaded (RCSB): 1XK1.pdb
Downloaded (RCSB): 1XK2.pdb
Downloaded (RCSB): 1XK3.pdb
Downloaded (RCSB): 3CZY.pdb
Downloaded (RCSB): 3HOK.pdb
Downloaded (RCSB): 3K4F.pdb
Downloaded (RCSB): 3TGM.pdb
Downloaded (RCSB): 4WD4.pdb
Downloaded (RCSB): 5BTQ.pdb
Downloaded (RCSB): 6EHA.pdb
Downloaded (AlphaFold): AF-P09601-F1.pdb
Downloaded (AlphaFold): AF-A0A7I2V3I1-F1.pdb


# Ligand: DF Experimental Structures (RCSB PDB)

some notes after troubleshooting

-do not ask requests from the website at once, queue the requests.

-check the resolution. extract the data too.

-do not create seperate rows for each chain ID but include them in one cell.

-include the names of the files without availabla sructures.

In [12]:
%pip install asyncio
%pip install aiohttp
%pip install openpyxl
import asyncio
import aiohttp
import pandas as pd
import warnings

# (Optional) Suppress openpyxl conditional formatting warning
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

# -----------------------------
# Step 1. Load and clean ligand data from the Excel file
# -----------------------------
ligand_file_path = "/Users/nusin/Library/Mobile Documents/com~apple~CloudDocs/Desktop/IOV/3_Projects/PPI/0_Proteins_of_Interest/list of 75 proteins of interest.xlsx"
ligand_df = pd.read_excel(ligand_file_path)
ligand_df.columns = ligand_df.columns.str.strip()
ligand_df['Accession'] = ligand_df['Accession'].str.replace(r'-1$', '', regex=True)

# -----------------------------
# Step 2. Define an async function to get the UniProt sequence length using the fasta endpoint
# -----------------------------
async def get_uniprot_sequence(accession, session):
    uniprot_url = f"https://www.uniprot.org/uniprot/{accession}.fasta"
    try:
        async with session.get(uniprot_url, timeout=10) as response:
            if response.status == 200:
                fasta_data = await response.text()
                # Skip the header (first line) and join the rest to get the full sequence
                sequence = "".join(fasta_data.splitlines()[1:])
                return len(sequence)
            else:
                print(f"Failed to fetch UniProt FASTA data for {accession}, Status code: {response.status}")
                return None
    except Exception as e:
        print(f"Error fetching UniProt FASTA data for {accession}: {e}")
        return None

# -----------------------------
# (Optional) If you need the UniProt JSON data for other purposes, you can still use this function.
# -----------------------------
async def get_uniprot_json_data(accession, session):
    url = f"https://rest.uniprot.org/uniprotkb/{accession}?format=json"
    try:
        async with session.get(url, timeout=10) as response:
            if response.status == 200:
                return await response.json()
            else:
                print(f"Failed to fetch UniProt JSON data for {accession}, Status code: {response.status}")
                return None
    except Exception as e:
        print(f"Error fetching UniProt JSON data for {accession}: {e}")
        return None

# -----------------------------
# Step 3. Define an async function to process one ligand
# -----------------------------
async def process_ligand(session, row):
    results_list = []
    gene_symbol = row["Gene Symbol"]
    ensembl_gene_id = row["Ensembl Gene ID"]
    accession = row["Accession"]
    pfam_ids = row["Pfam IDs"]

    print(f"Processing {accession}...")

    # Get UniProt sequence length using the FASTA endpoint
    seq_length = await get_uniprot_sequence(accession, session)
    if seq_length is None:
        print(f"Skipping {accession} due to missing UniProt sequence.")
        # Even if sequence length is missing, create a row with N/A for identifier
        results_list.append({
            "Gene Symbol": gene_symbol,
            "Ensembl Gene ID": ensembl_gene_id,
            "Accession": accession,
            "Pfam IDs": pfam_ids,
            "Identifier": "N/A",
            "Method": "N/A",
            "Resolution": "N/A",
            "Chain": "N/A",
            "Positions": "N/A",
            "Length": "N/A"
        })
        return results_list

    # (Optional) Get UniProt JSON data if needed for other details
    # uniprot_json = await get_uniprot_json_data(accession, session)
    # [You can process uniprot_json here if needed]

    # Build the JSON query for experimental structures (RCSB)
    query = {
        "query": {
            "type": "group",
            "logical_operator": "and",
            "nodes": [
                {
                    "type": "terminal",
                    "service": "text",
                    "parameters": {
                        "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
                        "operator": "exact_match",
                        "value": accession
                    }
                },
                {
                    "type": "terminal",
                    "service": "text",
                    "parameters": {
                        "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_name",
                        "operator": "exact_match",
                        "value": "UniProt"
                    }
                }
            ]
        },
        "return_type": "entry",
        "request_options": {"return_all_hits": True}
    }

    rcsb_query_url = "https://search.rcsb.org/rcsbsearch/v2/query"
    
    try:
        async with session.post(rcsb_query_url, json=query, timeout=10) as response:
            text = await response.text()
            if not text.strip():
                print(f"Empty response for accession {accession}. Skipping.")
                # Even if the response is empty, create a row with N/A for identifier
                results_list.append({
                    "Gene Symbol": gene_symbol,
                    "Ensembl Gene ID": ensembl_gene_id,
                    "Accession": accession,
                    "Pfam IDs": pfam_ids,
                    "Identifier": "N/A",
                    "Method": "N/A",
                    "Resolution": "N/A",
                    "Chain": "N/A",
                    "Positions": f"1-{seq_length}",
                    "Length": seq_length
                })
                return results_list
            result = await response.json()
    except Exception as e:
        print(f"Error querying RCSB for {accession}: {e}")
        # Even if there's an error querying, create a row with N/A for identifier
        results_list.append({
            "Gene Symbol": gene_symbol,
            "Ensembl Gene ID": ensembl_gene_id,
            "Accession": accession,
            "Pfam IDs": pfam_ids,
            "Identifier": "N/A",
            "Method": "N/A",
            "Resolution": "N/A",
            "Chain": "N/A",
            "Positions": f"1-{seq_length}",
            "Length": seq_length
        })
        return results_list

    pdb_ids = [entry["identifier"] for entry in result.get("result_set", [])]
    if not pdb_ids:
        print(f"No PDB IDs found for {accession}. Skipping.")
        # Even if no PDB IDs are found, create a row with N/A for identifier
        results_list.append({
            "Gene Symbol": gene_symbol,
            "Ensembl Gene ID": ensembl_gene_id,
            "Accession": accession,
            "Pfam IDs": pfam_ids,
            "Identifier": "N/A",
            "Method": "N/A",
            "Resolution": "N/A",
            "Chain": "N/A",
            "Positions": f"1-{seq_length}",
            "Length": seq_length
        })
        return results_list

    for pdb_id in pdb_ids:
        pdb_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
        try:
            async with session.get(pdb_url, timeout=10) as pdb_response:
                pdb_data = await pdb_response.json()
                method = pdb_data.get("rcsb_entry_info", {}).get("experimental_method", "N/A")
                # Extract resolution using the "resolution_combined" field (as in the receptor code)
                resolution_list = pdb_data.get("rcsb_entry_info", {}).get("resolution_combined", ["N/A"])
                resolution = resolution_list[0]
                if resolution != "N/A":
                    resolution = f"{resolution} Å"

                polymer_entities = pdb_data.get("rcsb_entry_container_identifiers", {}).get("polymer_entity_ids", [])
                all_chains = []

                # Process each polymer entity to accumulate chain identifiers
                for entity_id in polymer_entities:
                    entity_url = f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{entity_id}"
                    try:
                        async with session.get(entity_url, timeout=10) as entity_response:
                            entity_data = await entity_response.json()
                            chain_list = entity_data.get("rcsb_polymer_entity_container_identifiers", {}).get("auth_asym_ids", [])
                            if chain_list:
                                all_chains.extend(chain_list)
                    except Exception as e:
                        print(f"Error fetching polymer entity {entity_id} for {pdb_id}: {e}")
                        continue

                # Combine all chains into a single string, separated by '/'
                chain = "/".join(all_chains) if all_chains else "N/A"

                # Check if the protein entry already exists in results_list
                existing_entry = next((res for res in results_list if res["Identifier"] == pdb_id), None)
                if existing_entry:
                    existing_entry["Chain"] += f"/{chain}" if chain != "N/A" else ""
                else:
                    results_list.append({
                        "Gene Symbol": gene_symbol,
                        "Ensembl Gene ID": ensembl_gene_id,
                        "Accession": accession,
                        "Pfam IDs": pfam_ids,
                        "Identifier": pdb_id,
                        "Method": method,
                        "Resolution": resolution,
                        "Chain": chain,
                        "Positions": f"1-{seq_length}",
                        "Length": seq_length
                    })
        except Exception as e:
            print(f"Error fetching PDB entry details for {pdb_id}: {e}")
            continue

    return results_list

# -----------------------------
# Wrap process_ligand with a semaphore to limit concurrent tasks
# -----------------------------
async def sem_process_ligand(semaphore, session, row):
    async with semaphore:
        return await process_ligand(session, row)

# -----------------------------
# Step 4. Process all ligands concurrently (limit to 10 concurrent tasks)
# -----------------------------
async def fetch_ligand_structures():
    all_results = []
    semaphore = asyncio.Semaphore(10)  # Limit concurrent tasks
    async with aiohttp.ClientSession() as session:
        tasks = []
        for idx, row in ligand_df.iterrows():
            tasks.append(sem_process_ligand(semaphore, session, row))
        ligands_results = await asyncio.gather(*tasks)
        for result in ligands_results:
            all_results.extend(result)
    return all_results

# -----------------------------
# Step 5. Main async function to get data and save results
# -----------------------------
async def main():
    ligand_results = await fetch_ligand_structures()
    df_ligand_structures = pd.DataFrame(ligand_results)
    print(df_ligand_structures.head())
    save_path = "/Users/nusin/Library/Mobile Documents/com~apple~CloudDocs/Desktop/IOV/3_Projects/PPI/1_Input_Proteins_pdbs/df_all_ligands.csv"
    df_ligand_structures.to_csv(save_path, index=False)
    print(f"Saved ligand structures to {save_path}")

# -----------------------------
# Run the async main function (e.g., in Jupyter, use await main())
# -----------------------------
await main()


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Processing P01040...
Processing P07900...
Processing P11021...
Processing Q06830...
Processing P60709...
Processing P07355...
Processing Q8N163...
Processing Q9NX63...
Processing P13073...
Processing P21291...
Empty response for accession Q9NX63. Skipping.
Processing P68104...
Empty response for accession P21291. Skipping.
Processing P06733...
Processing P21333...
Processing P04406...
Processing P04899...
Processing Q92522...
Processing P16402...
Processing P16401...
Processing Q16777...
Processing Q7L7L0...
Processing Q5QNW6...
Processing P84243...
Processing P68431...
Processing Q71DI3...
Processing P62805...
Processing Q32P51...
Empty response for accession Q32P51. Skipping.
Processing P31943...
Processing Q00839...
Empty response for accession Q00839. Skipping.
Processing Q5SSJ5...
Proce

# Ligand: Filter Experimental Structures (RCSB PDB)

In [13]:
import pandas as pd

# Function to apply filters
def filter_ligand_structures(ligand_df):
    # Group by the "Accession" to handle duplicates and prioritize based on criteria
    grouped = ligand_df.groupby("Accession")

    filtered_results = []
    for accession, group in grouped:
        if len(group) <= 2:
            # If there are one or two rows, keep all of them
            filtered_results.append(group)
        else:
            # If there are more than 2 rows, prioritize by "Positions" length
            group["Position Length"] = group["Positions"].str.split('-').str[1].astype(int)
            max_position_length = group["Position Length"].max()

            # Filter out rows with the maximum position length
            max_position_rows = group[group["Position Length"] == max_position_length]

            if len(max_position_rows) > 2:
                # If there are still more than 2 rows with the same longest "Positions", prioritize by "Resolution"
                max_position_rows = max_position_rows.sort_values(by="Resolution", ascending=False)

            # Append exactly 2 entries, even if there are more after filtering
            filtered_results.append(max_position_rows.head(2))  # Only take the top 2

    # Concatenate all filtered results and reset index
    final_filtered_df = pd.concat(filtered_results).reset_index(drop=True)

    # Fill missing values with "N/A"
    final_filtered_df = final_filtered_df.fillna('N/A')

    return final_filtered_df

# -----------------------------
# Apply filters and save the results
# -----------------------------
def apply_filters_and_save():
    # Load the data from the specified CSV file
    file_path = "/Users/nusin/Library/Mobile Documents/com~apple~CloudDocs/Desktop/IOV/3_Projects/PPI/1_Input_Proteins_pdbs/df_all_ligands.csv"
    df_ligand_structures = pd.read_csv(file_path)
    
    filtered_df = filter_ligand_structures(df_ligand_structures)
    print(f"Filtered data:\n{filtered_df.head()}")
    
    # Save the filtered data to a new CSV file
    save_path = "/Users/nusin/Library/Mobile Documents/com~apple~CloudDocs/Desktop/IOV/3_Projects/PPI/1_Input_Proteins_pdbs/df_filtered_ligands.csv"
    filtered_df.to_csv(save_path, index=False)
    print(f"Saved filtered ligand structures to {save_path}")

# Apply the filters and save the results
apply_filters_and_save()


Filtered data:
  Gene Symbol  Ensembl Gene ID Accession          Pfam IDs Identifier Method  \
0      LGALS9  ENSG00000168961    O00182           Pf00337       3LSE  X-ray   
1      LGALS9  ENSG00000168961    O00182           Pf00337       3NV2  X-ray   
2      MYL12B  ENSG00000118680    O14950  Pf08976, Pf13499        N/A    N/A   
3        CSTA  ENSG00000121552    P01040           Pf00031       8GT0  X-ray   
4        CSTA  ENSG00000121552    P01040           Pf00031       8GT7  X-ray   

  Resolution        Chain Positions  Length Position Length  
0     2.69 Å            A     1-355     355           355.0  
1     2.34 Å            A     1-355     355           355.0  
2        N/A          N/A     1-172     172             N/A  
3     3.28 Å    A/C/E/B/D      1-98      98            98.0  
4     3.28 Å  A/C/B/D/E/F      1-98      98            98.0  
Saved filtered ligand structures to /Users/nusin/Library/Mobile Documents/com~apple~CloudDocs/Desktop/IOV/3_Projects/PPI/1_Input_Pro

# Ligand: Merge DF Experimental and Alphafold (RCSB PDB)


In [14]:
import requests
import pandas as pd

# --- Step 1: Read the CSV with experimental (real structure) data ---
exp_csv_path = "/Users/nusin/Library/Mobile Documents/com~apple~CloudDocs/Desktop/IOV/3_Projects/PPI/1_Input_Proteins_pdbs/df_filtered_ligands.csv"
exp_df = pd.read_csv(exp_csv_path)

# --- Step 2: Create a DataFrame with AlphaFold-predicted structure rows ---
# For each unique accession, check if the AlphaFold file exists.
unique_accessions = exp_df['Accession'].unique()
alphafold_rows = []

for accession in unique_accessions:
    # Retrieve common info from one sample row for this accession
    sample_row = exp_df[exp_df['Accession'] == accession].iloc[0]
    
    # Construct the AlphaFold identifier and URL
    af_id = f"AF-{accession}-F1"
    af_url = f"https://alphafold.ebi.ac.uk/files/{af_id}-model_v4.pdb"
    
    # Check if the AlphaFold file exists using a HEAD request.
    try:
        response = requests.head(af_url, timeout=10)
        if response.status_code == 200:
            method = "Predicted"
            identifier = af_id
        else:
            method = "No Predicted Structure"
            identifier = "No Predicted Structure"
    except Exception as e:
        print(f"Error checking AlphaFold for {accession}: {e}")
        method = "No Predicted Structure"
        identifier = "No Predicted Structure"
    
    # Use the 'Length' (if available) to form the Positions string.
    length = sample_row.get("Length", "")
    positions = f"1-{length}" if pd.notnull(length) and length != "" else ""
    
    alphafold_rows.append({
        "Gene Symbol": sample_row["Gene Symbol"],
        "Ensembl Gene ID": sample_row["Ensembl Gene ID"],
        "Accession": accession,
        "Pfam IDs": sample_row["Pfam IDs"],
        "Identifier": identifier,
        "Method": method,
        "Resolution": "",
        "Chain": "",
        "Positions": positions,
        "Length": length
    })

alphafold_df = pd.DataFrame(alphafold_rows)

# --- Step 3: Merge the experimental and AlphaFold DataFrames ---
merged_df = pd.concat([exp_df, alphafold_df], ignore_index=True)

# Sort the merged DataFrame by Accession (and optionally by Identifier)
merged_df = merged_df.sort_values(by=["Accession", "Identifier"])

# --- Step 4: Fill missing values with "N/A" ---
merged_df = merged_df.fillna('N/A')

# --- Step 5: Save the merged DataFrame to a new CSV ---
merged_csv_path = "/Users/nusin/Library/Mobile Documents/com~apple~CloudDocs/Desktop/IOV/3_Projects/PPI/1_Input_Proteins_pdbs/df_merged_ligands.csv"
merged_df.to_csv(merged_csv_path, index=False)

print("Merged DataFrame saved to:", merged_csv_path)


Merged DataFrame saved to: /Users/nusin/Library/Mobile Documents/com~apple~CloudDocs/Desktop/IOV/3_Projects/PPI/1_Input_Proteins_pdbs/df_merged_ligands.csv


# Ligand: Download Structures

*now to download the structures as pdb files*

For experimental data I am using: https://files.rcsb.org/download

For alphafold strcutures I am using: https://alphafold.ebi.ac.uk/files

Edit the csv, to show if it is downloaded: df_merged_ligands.csv

In [15]:
import os
import pandas as pd
import requests

# Paths
df_ligand_path = '/Users/nusin/Library/Mobile Documents/com~apple~CloudDocs/Desktop/IOV/3_Projects/PPI/1_Input_Proteins_pdbs/df_merged_ligands.csv'

# Output directories for ligand PDB files
rcsb_output_dir = '/Users/nusin/Library/Mobile Documents/com~apple~CloudDocs/Desktop/IOV/3_Projects/PPI/1_Input_Proteins_pdbs/ligand_pdbs/rcsb'
alphafold_output_dir = '/Users/nusin/Library/Mobile Documents/com~apple~CloudDocs/Desktop/IOV/3_Projects/PPI/1_Input_Proteins_pdbs/ligand_pdbs/alphafold'

# Create directories if they do not exist
os.makedirs(rcsb_output_dir, exist_ok=True)
os.makedirs(alphafold_output_dir, exist_ok=True)

# Load the ligand DataFrame
df_ligand = pd.read_csv(df_ligand_path)

# Base URLs
rcsb_download_base_url = "https://files.rcsb.org/download/"
alphafold_download_base_url = "https://alphafold.ebi.ac.uk/files/"

# Function to download PDB files from RCSB
def download_rcsb_pdb(identifier, output_directory):
    pdb_url = f"{rcsb_download_base_url}{identifier}.pdb"
    try:
        response = requests.get(pdb_url, timeout=10)
        response.raise_for_status()
        pdb_file_path = os.path.join(output_directory, f"{identifier}.pdb")
        with open(pdb_file_path, 'wb') as pdb_file:
            pdb_file.write(response.content)
        msg = f"Downloaded (RCSB): {identifier}.pdb"
        print(msg)
        return True, msg
    except requests.exceptions.RequestException as e:
        msg = f"Failed to download {identifier} from RCSB: {e}"
        print(msg)
        return False, msg

# Function to download AlphaFold PDB files
def download_alphafold_pdb(identifier, output_directory):
    pdb_url = f"{alphafold_download_base_url}{identifier}-model_v4.pdb"
    try:
        response = requests.get(pdb_url, timeout=10)
        response.raise_for_status()
        pdb_file_path = os.path.join(output_directory, f"{identifier}.pdb")
        with open(pdb_file_path, 'wb') as pdb_file:
            pdb_file.write(response.content)
        msg = f"Downloaded (AlphaFold): {identifier}.pdb"
        print(msg)
        return True, msg
    except requests.exceptions.RequestException as e:
        msg = f"Failed to download {identifier} from AlphaFold: {e}"
        print(msg)
        return False, msg

# Download PDB files and record the results
download_results = []
failed_downloads = []  # To store failed download identifiers

# Iterate over all unique identifiers in the DataFrame
for identifier in df_ligand['Identifier'].dropna().unique():
    if identifier.startswith("AF-"):
        success, message = download_alphafold_pdb(identifier, alphafold_output_dir)
    else:
        success, message = download_rcsb_pdb(identifier, rcsb_output_dir)
    
    # Mark which files failed
    if not success:
        failed_downloads.append(identifier)
    
    download_results.append((identifier, success, message))

# Ensure N/A rows are included and marked correctly
df_ligand['downloads'] = df_ligand.apply(
    lambda row: 'no' if pd.isna(row['Identifier']) or row['Identifier'] in failed_downloads else 'yes', axis=1
)

# Save the updated DataFrame, ensuring that N/A rows are maintained
df_ligand_path = "/Users/nusin/Library/Mobile Documents/com~apple~CloudDocs/Desktop/IOV/3_Projects/PPI/1_Input_Proteins_pdbs/df_downloaded_ligands.csv"
df_ligand.to_csv(df_ligand_path, index=False)

# Print failed downloads list
if failed_downloads:
    print("\nFiles that failed to download:")
    for failed_file in failed_downloads:
        print(failed_file)
else:
    print("\nAll files were downloaded successfully.")


Downloaded (RCSB): 3LSE.pdb
Downloaded (RCSB): 3NV2.pdb
Downloaded (AlphaFold): AF-O00182-F1.pdb
Downloaded (AlphaFold): AF-O14950-F1.pdb
Downloaded (RCSB): 8GT0.pdb
Downloaded (RCSB): 8GT7.pdb
Downloaded (AlphaFold): AF-P01040-F1.pdb
Downloaded (RCSB): 2YPT.pdb
Downloaded (RCSB): 7D9N.pdb
Downloaded (AlphaFold): AF-P02545-F1.pdb
Downloaded (RCSB): 3GPD.pdb
Downloaded (RCSB): 8DNS.pdb
Downloaded (AlphaFold): AF-P04406-F1.pdb
Downloaded (RCSB): 8B6L.pdb
Downloaded (RCSB): 8PN9.pdb
Downloaded (AlphaFold): AF-P04844-F1.pdb
Downloaded (RCSB): 6D9H.pdb
Failed to download 8Y45 from RCSB: 404 Client Error: Not Found for url: https://files.rcsb.org/download/8Y45.pdb
Downloaded (AlphaFold): AF-P04899-F1.pdb
Downloaded (AlphaFold): AF-P05783-F1.pdb
Downloaded (RCSB): 5LAX.pdb
Downloaded (RCSB): 8TRL.pdb
Downloaded (AlphaFold): AF-P06733-F1.pdb
Downloaded (RCSB): 8AH2.pdb
Downloaded (RCSB): 8AS5.pdb
Downloaded (AlphaFold): AF-P06748-F1.pdb
Downloaded (RCSB): 4DRW.pdb
Downloaded (RCSB): 5LQ2.pdb
D