In [1]:
import pandas as pd
import requests
from tqdm import tqdm

# Load the domain table and pre-create the annotation columns that the
# lookup loop fills in. Rows that are never annotated keep these
# placeholder values.
df = pd.read_csv("../data/subset.csv")

for column_name, placeholder in (
    ("protein_id", ""),
    ("domain_start", None),
    ("domain_end", None),
    ("protein_sequence", ""),
):
    df[column_name] = placeholder

def get_uniprot_mapping(pdb_id, chain_id, timeout=30):
    """Look up the UniProt mapping of one PDB chain via the PDBe mappings API.

    Args:
        pdb_id: PDB entry ID; it is lower-cased before being queried.
        chain_id: Chain identifier, compared case-sensitively with the
            ``chain_id`` field of each mapping record.
        timeout: Seconds to wait for the PDBe server before giving up, so
            a stalled connection cannot hang the caller indefinitely.

    Returns:
        A ``(uniprot_id, start, end)`` tuple for the first mapping record
        whose chain matches, where ``start`` and ``end`` are the integer
        ``residue_number`` values of the record's ``start`` and ``end``
        entries. Returns ``None`` (after printing a message) when the
        request fails, the response is not valid JSON, or no record
        matches the chain.

    NOTE(review): the records are believed to also carry ``unp_start`` /
    ``unp_end`` fields; confirm that ``residue_number`` is the coordinate
    system the downstream columns are meant to hold.
    """
    entry = pdb_id.lower()
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{entry}"

    # Network failures are reported and turned into None, matching the
    # best-effort behaviour of get_uniprot_sequence, instead of aborting
    # a long batch run.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f" Error fetching mapping for {pdb_id}: {e}")
        return None

    if response.status_code != 200:
        print(f" Could not fetch mapping for {pdb_id}")
        return None

    try:
        data = response.json()
    except ValueError as e:
        print(f" Error decoding mapping for {pdb_id}: {e}")
        return None

    mappings = data.get(entry, {}).get("UniProt", {})

    for uniprot_id, details in mappings.items():
        for region in details.get("mappings", []):
            if region.get("chain_id") != chain_id:
                continue
            try:
                start = int(region["start"]["residue_number"])
                end = int(region["end"]["residue_number"])
            except (KeyError, TypeError, ValueError):
                # Incomplete record: keep looking for a usable one.
                continue
            return uniprot_id, start, end

    print(f" No mapping found for chain {chain_id} in {pdb_id}")
    return None


def get_uniprot_sequence(uniprot_id, timeout=30):
    """Fetch the protein sequence of a UniProt entry from its FASTA record.

    Args:
        uniprot_id: UniProt accession inserted into the REST URL.
        timeout: Seconds to wait for the UniProt server before giving up,
            so a stalled connection cannot hang the caller indefinitely.

    Returns:
        The sequence as a single string with line breaks removed, or an
        empty string (after printing a message) when the request fails or
        the server does not answer with status 200.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"

    # Only the request itself can raise here, so the try block is limited
    # to it and to the request-specific exception family.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f" Error fetching sequence for {uniprot_id}: {e}")
        return ""

    if response.status_code != 200:
        print(f" Could not fetch UniProt sequence for {uniprot_id}")
        return ""

    # The first line of the response is the FASTA header; the remaining
    # lines are the wrapped sequence.
    lines = response.text.splitlines()
    return "".join(lines[1:])


# Annotate every row with the UniProt accession, mapped residue range and
# protein sequence of its chain, then write the enriched table to disk.
#
# Successful lookups are memoised because the same PDB chain and the same
# UniProt entry occur on several rows. Failed lookups are deliberately not
# cached, so they are retried (and reported) on each row as before.
mapping_cache = {}
sequence_cache = {}

for idx, row in tqdm(df.iterrows(), total=len(df)):
    domain_id = row["domain_id"]
    # A blank CSV cell loads as NaN (a float), on which len() would raise.
    if not isinstance(domain_id, str) or len(domain_id) < 5:
        continue  # skip malformed IDs

    # The first four characters are used as the PDB ID and the fifth as
    # the chain ID.
    pdb_id = domain_id[:4]
    chain_id = domain_id[4]

    chain_key = (pdb_id.lower(), chain_id)
    mapping = mapping_cache.get(chain_key)
    if mapping is None:
        mapping = get_uniprot_mapping(pdb_id, chain_id)
        if not mapping:
            continue
        mapping_cache[chain_key] = mapping

    uniprot_id, start, end = mapping

    sequence = sequence_cache.get(uniprot_id)
    if not sequence:
        sequence = get_uniprot_sequence(uniprot_id)
        if sequence:
            sequence_cache[uniprot_id] = sequence

    df.at[idx, "protein_id"] = uniprot_id
    df.at[idx, "domain_start"] = start
    df.at[idx, "domain_end"] = end
    df.at[idx, "protein_sequence"] = sequence

df.to_csv("../data/subset_with_protein_info.csv", index=False)

  4%|▍         | 42/1000 [00:17<05:28,  2.91it/s]

 Could not fetch mapping for 4g54


  9%|▉         | 94/1000 [00:37<05:23,  2.80it/s]

 No mapping found for chain B in 4a5u


 13%|█▎        | 133/1000 [00:52<04:37,  3.13it/s]

 Could not fetch mapping for 4qre


 15%|█▍        | 146/1000 [00:58<04:40,  3.04it/s]

 Could not fetch mapping for 1vw4


 15%|█▌        | 153/1000 [01:01<04:46,  2.96it/s]

 Could not fetch mapping for 2k6l


 20%|██        | 203/1000 [01:21<04:10,  3.18it/s]

 Could not fetch mapping for 1orn


 22%|██▏       | 219/1000 [01:27<04:45,  2.73it/s]

 Could not fetch mapping for 6mh4


 22%|██▏       | 220/1000 [01:28<04:38,  2.80it/s]

 Could not fetch mapping for 2y1e


 22%|██▏       | 222/1000 [01:28<03:50,  3.38it/s]

 Could not fetch mapping for 3rlb


 24%|██▎       | 235/1000 [01:33<04:26,  2.87it/s]

 Could not fetch mapping for 6ep5


 26%|██▌       | 258/1000 [01:41<03:57,  3.12it/s]

 Could not fetch mapping for 4gns


 37%|███▋      | 371/1000 [02:28<03:11,  3.29it/s]

 Could not fetch mapping for 1vs9


 40%|████      | 402/1000 [02:41<03:26,  2.90it/s]

 Could not fetch mapping for 6dcm


 46%|████▌     | 460/1000 [03:06<03:41,  2.44it/s]

 Could not fetch mapping for 5yxg


 46%|████▋     | 463/1000 [03:07<02:57,  3.02it/s]

 Could not fetch mapping for 4eiu


 46%|████▋     | 465/1000 [03:08<03:17,  2.71it/s]

 Could not fetch mapping for 4fxt


 47%|████▋     | 468/1000 [03:09<02:50,  3.13it/s]

 Could not fetch mapping for 4iyk


 53%|█████▎    | 527/1000 [03:34<03:06,  2.53it/s]

 Could not fetch mapping for 2xsg


 54%|█████▎    | 535/1000 [03:36<02:38,  2.93it/s]

 Could not fetch mapping for 4kca


 56%|█████▌    | 558/1000 [03:45<02:23,  3.07it/s]

 No mapping found for chain I in 4u1e


 56%|█████▌    | 562/1000 [03:47<02:42,  2.69it/s]

 Could not fetch mapping for 5z9t


 59%|█████▊    | 587/1000 [03:57<02:02,  3.36it/s]

 Could not fetch mapping for 7vc6


 59%|█████▉    | 591/1000 [03:58<02:18,  2.95it/s]

 Could not fetch mapping for 4ush


 81%|████████  | 808/1000 [05:25<00:57,  3.32it/s]

 Could not fetch mapping for 2g5x


 82%|████████▏ | 819/1000 [05:30<01:28,  2.05it/s]

 Could not fetch mapping for 3cpx


 83%|████████▎ | 826/1000 [05:32<00:53,  3.27it/s]

 Could not fetch mapping for 3n5m


 87%|████████▋ | 874/1000 [05:51<00:35,  3.54it/s]

 Could not fetch mapping for 5cdk
 Could not fetch mapping for 3p9d


 88%|████████▊ | 879/1000 [05:52<00:34,  3.52it/s]

 Could not fetch mapping for 4d8q


 90%|████████▉ | 896/1000 [06:00<00:35,  2.90it/s]

 Could not fetch mapping for 4hst


 91%|█████████▏| 913/1000 [06:06<00:39,  2.19it/s]

 Could not fetch mapping for 3o6n


 96%|█████████▌| 957/1000 [06:23<00:14,  2.95it/s]

 Could not fetch mapping for 7znp


 98%|█████████▊| 976/1000 [06:30<00:07,  3.15it/s]

 Could not fetch mapping for 1vw4


 98%|█████████▊| 980/1000 [06:31<00:04,  4.16it/s]

 Could not fetch mapping for 4a17
 Could not fetch mapping for 1vw4


100%|██████████| 1000/1000 [06:39<00:00,  2.50it/s]
