In [10]:
import requests
import time
import os
from Bio import SeqIO
import re
import urllib.parse
import pandas as pd

### Get Data

In [11]:
#Dictionary of protein names and uniprot queries
query_dict = {
    "Phospholipase A2(PLA2)": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:phospholipase OR protein_name:pla2)",
    "Snake Venom Metalloproteinases(SVMP)": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:Metalloproteinase OR protein_name:SVMP)",
    "Disintegrins": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:disintegrin)",
    "Snake Venom Serine Proteases(SVSP)": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:Serine Protease OR protein_name:SVSP)",
    "Three-Finger Toxins(3FTX)": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:three-finger OR protein_name:3ftx)",
    "Cysteine-Rich Secretory Proteins(CRISP)": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:cysteine-rich OR protein_name:CRISP)",
    "Kunitz-Type Protease Inhibitors": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:kunitz)",
    "L-Amino Acid Oxidases(LAAO)": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:L-Amino Acid Oxidase OR protein_name:LAAO)",
    "Nerve Growth Factor(NGF)": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:nerve growth factor OR protein_name:NGF)",
    "Vascular Endothelial Growth Factor(VEGF)": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:Vascular Endothelial Growth Factor OR protein_name:VEGF)",
    "Bradykinin-Potentiating Peptides(BPP)": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:Bradykinin-Potentiating Peptide OR protein_name:BPP)",
    "Natriuretic Peptides(NP)": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:Natriuretic Peptide OR protein_name:NP)",
    "C-Type Lectins or Lectin-Like Proteins": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:C-Type Lectin OR protein_name:Lectin-Like Protein OR protein_name:CTL)",
    "5'-Nucleotidases": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:5'-Nucleotidase OR protein_name:5'NT)",
    "Hyaluronidases": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:Hyaluronidase)",
    "Phosphodiesterases (PDE)": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:Phosphodiesterase OR protein_name:PDE)",
    "Neurotrophins (other than NGF)": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:Neurotrophin)",
    "Glutaminyl Cyclase(QC)": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:Glutaminyl Cyclase OR protein_name:QC)",
    "Beta-bungarotoxin": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:Beta-bungarotoxin)",
    "Ohanin": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:Ohanin)",
    "Vespryns": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:Vespryn)",
    "Waprins": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:Waprin)",
    "Sarafotoxins": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:Sarafotoxin)",
    "Taicatoxin": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:Taicatoxin)",
    "Convulxin": "(taxonomy_id:8570) AND (venom OR toxin) AND (protein_name:Convulxin)"
}

#### Collect data with Uniprot Rest API

In [12]:
base_url = "https://rest.uniprot.org/uniprotkb/search"
max_size = 500  # max results per page

for protein_name, query in query_dict.items():
    print(f"Fetching sequences for {protein_name}...")
    all_results = []
    cursor = None
    total_count = 0
    
    while True:
        params = {
            "query": query,
            "format": "fasta",
            "size": max_size,
        }
        if cursor:
            params["cursor"] = cursor
        
        response = requests.get(base_url, params=params)
        if response.status_code != 200:
            print(f"Failed to retrieve data for {protein_name}: {response.status_code}")
            break
        
        data = response.text
        if not data.strip():
            break
        
        all_results.append(data)
        
        # Get 'x-total-results' header to know total
        if total_count == 0:
            total_count = int(response.headers.get("x-total-results", "0"))
        
        # Check if next page cursor is present in Link header
        link_header = response.headers.get("Link", "")
        next_cursor = None
        for link_part in link_header.split(","):
            if 'rel="next"' in link_part:
                # Example link_part: <https://rest.uniprot.org/uniprotkb/search?query=...&cursor=xyz>; rel="next"
                url_start = link_part.find("<") + 1
                url_end = link_part.find(">", url_start)
                url = link_part[url_start:url_end]
                parsed = urllib.parse.urlparse(url)
                qs = urllib.parse.parse_qs(parsed.query)
                cursor_list = qs.get("cursor")
                if cursor_list:
                    next_cursor = cursor_list[0]
                break
                
        if not next_cursor:
            break  # no more pages
        cursor = next_cursor
        
        time.sleep(1)  # be kind to server
    
    if all_results:
        fasta_text = "\n".join(all_results)
        filename = os.path.join("raw_data", f"{protein_name.replace(' ', '_')}.fasta")
        with open(filename, "w") as f:
            f.write(fasta_text)
        print(f"Saved {total_count} sequences for {protein_name} to {filename}")
    else:
        print(f"No sequences found for {protein_name}")


Fetching sequences for Phospholipase A2(PLA2)...
Saved 1735 sequences for Phospholipase A2(PLA2) to raw_data/Phospholipase_A2(PLA2).fasta
Fetching sequences for Snake Venom Metalloproteinases(SVMP)...
Saved 1378 sequences for Snake Venom Metalloproteinases(SVMP) to raw_data/Snake_Venom_Metalloproteinases(SVMP).fasta
Fetching sequences for Disintegrins...
Saved 599 sequences for Disintegrins to raw_data/Disintegrins.fasta
Fetching sequences for Snake Venom Serine Proteases(SVSP)...
Saved 1102 sequences for Snake Venom Serine Proteases(SVSP) to raw_data/Snake_Venom_Serine_Proteases(SVSP).fasta
Fetching sequences for Three-Finger Toxins(3FTX)...
Saved 668 sequences for Three-Finger Toxins(3FTX) to raw_data/Three-Finger_Toxins(3FTX).fasta
Fetching sequences for Cysteine-Rich Secretory Proteins(CRISP)...
Saved 382 sequences for Cysteine-Rich Secretory Proteins(CRISP) to raw_data/Cysteine-Rich_Secretory_Proteins(CRISP).fasta
Fetching sequences for Kunitz-Type Protease Inhibitors...
Saved 394

#### Parse .fasta Files
Create metadata.csv and protein_sequences.csv

In [13]:
def parse_uniprot_fasta_header(header):
    parts = header.split(' ', 1)
    uniprot_part = parts[0]  # e.g. sp|W8EFS0|V5NTD_MACLB
    rest = parts[1] if len(parts) > 1 else ""

    db, accession, uniprot_id = (uniprot_part.split('|') + ["", ""])[:3]

    title = rest.split(" OS=")[0].strip() if " OS=" in rest else rest.strip()

    os_match = re.search(r'OS=([^=]+?) (OX=|PE=|SV=|$)', rest)
    ox_match = re.search(r'OX=(\d+)', rest)
    pe_match = re.search(r'PE=(\d+)', rest)
    
    organism = os_match.group(1).strip() if os_match else ""
    taxonomy_id = ox_match.group(1).strip() if ox_match else ""
    evidence_level = pe_match.group(1).strip() if pe_match else ""

    return {
        "database": db,
        "database_id": accession,
        "uniprot_id": uniprot_id,
        "title": title,
        "organism": organism,
        "taxonomy_id": taxonomy_id,
        "evidence_level": evidence_level,
    }

metadata_all = []
sequences_all = []
directory = "raw_data"

for filename in os.listdir(directory):
    if filename.endswith(".fasta"):
        protein_name = os.path.splitext(filename)[0]
        filepath = os.path.join(directory, filename)
        
        for record in SeqIO.parse(filepath, "fasta"):
            meta = parse_uniprot_fasta_header(record.description)
            meta["protein"] = protein_name
            metadata_all.append(meta)
            
            sequences_all.append({
                "uniprot_id": meta["uniprot_id"],
                "protein_name": protein_name,
                "protein_sequence": str(record.seq)
            })

df_meta = pd.DataFrame(metadata_all)
df_seq = pd.DataFrame(sequences_all)

df_meta.to_csv("raw_data/metadata/metadata.csv", index=False)
df_seq.to_csv("raw_data/protein_sequences/protein_sequences.csv", index=False)


print("Metadata saved to metadata.csv")
print("Protein sequences saved to protein_sequences.csv")


Metadata saved to metadata.csv
Protein sequences saved to protein_sequences.csv


#### Inspect metadata.csv and protein_sequences.csv

In [29]:
metadata_df = pd.read_csv("raw_data/metadata/metadata.csv")
metadata_df.head()

Unnamed: 0,database,database_id,uniprot_id,title,organism,taxonomy_id,evidence_level,protein
0,sp,B3EWP2,HYAL_CRODU,Hyaluronidase CdtHya1 (Fragment),Crotalus durissus terrificus,8732,1,Hyaluronidases
1,sp,J3S820,HYAL_CROAD,Hyaluronidase,Crotalus adamanteus,8729,1,Hyaluronidases
2,sp,A3QVN2,HYAL_ECHOC,Hyaluronidase,Echis ocellatus,99586,2,Hyaluronidases
3,sp,A3QVN6,HYAL_ECHPL,Hyaluronidase,Echis pyramidum leakeyi,38415,2,Hyaluronidases
4,sp,A3QVP0,HYAL2_BITAR,Hyaluronidase-2,Bitis arietans,8692,2,Hyaluronidases


In [30]:
sequences_df = pd.read_csv("raw_data/protein_sequences/protein_sequences.csv")
sequences_df.head()

Unnamed: 0,uniprot_id,protein_name,protein_sequence
0,HYAL_CRODU,Hyaluronidases,MQAKAPMYPNEPFLVFWNAPTTQCRLRYKVDLDLNTFHIVTNAR
1,HYAL_CROAD,Hyaluronidases,MYHLWIKCLAAWIFLKRFNGVHVMQAKAPMYPNEPFLVFWNAPTTQ...
2,HYAL_ECHOC,Hyaluronidases,MYHLWIKCLAAWIFLKRFNGVHVMHAKAPMYPNEPFLVFWNAPTTQ...
3,HYAL_ECHPL,Hyaluronidases,MYHIWIKFLAAWIFLKRFNGVHVMQAKAPMYRNEPFLVFWNAPTTQ...
4,HYAL2_BITAR,Hyaluronidases,MYHLWIKCLAAWIFLKRCNGVHAMPAKAPMYPNEPFIVLWNAPTTQ...


In [31]:
#Compare their shapes
print(metadata_df.shape)
print(sequences_df.shape)

(8838, 8)
(8838, 3)


In [32]:
metadata_df.groupby('protein').count()

Unnamed: 0_level_0,database,database_id,uniprot_id,title,organism,taxonomy_id,evidence_level
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5'-Nucleotidases,140,140,140,140,140,140,140
Beta-bungarotoxin,41,41,41,41,41,41,41
Bradykinin-Potentiating_Peptides(BPP),95,95,95,95,95,95,95
C-Type_Lectins_or_Lectin-Like_Proteins,873,873,873,873,873,873,873
Convulxin,3,3,3,3,3,3,3
Cysteine-Rich_Secretory_Proteins(CRISP),382,382,382,382,382,382,382
Disintegrins,599,599,599,599,599,599,599
Glutaminyl_Cyclase(QC),28,28,28,28,28,28,28
Hyaluronidases,132,132,132,132,132,132,132
Kunitz-Type_Protease_Inhibitors,394,394,394,394,394,394,394


In [33]:
sequences_df.groupby('protein_name').count()

Unnamed: 0_level_0,uniprot_id,protein_sequence
protein_name,Unnamed: 1_level_1,Unnamed: 2_level_1
5'-Nucleotidases,140,140
Beta-bungarotoxin,41,41
Bradykinin-Potentiating_Peptides(BPP),95,95
C-Type_Lectins_or_Lectin-Like_Proteins,873,873
Convulxin,3,3
Cysteine-Rich_Secretory_Proteins(CRISP),382,382
Disintegrins,599,599
Glutaminyl_Cyclase(QC),28,28
Hyaluronidases,132,132
Kunitz-Type_Protease_Inhibitors,394,394
