In [None]:
from Bio import SeqIO
import requests
import sys
import os

def search_pdb(sequence_id):
    """Return the PDB ID of the first entity annotated with a UniProt accession.

    The RCSB search service is asked for polymer entities whose reference
    sequence identifiers contain ``sequence_id`` as a UniProt accession.
    The accession is matched against the reference-sequence fields rather
    than the source-organism taxonomy ID, because callers pass sequence
    accessions, not organism identifiers.

    Args:
        sequence_id: Accession to look up (e.g. ``"P69905"``).

    Returns:
        The PDB entry ID (the part of the entity identifier before ``_``)
        of the first hit, or ``None`` when ``sequence_id`` is empty, the
        service does not answer with HTTP 200, or the result set is empty.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    import json  # local import: only needed to serialise the query

    if not sequence_id:
        # Nothing to search for; avoid a pointless round trip.
        return None

    identifiers = (
        "rcsb_polymer_entity_container_identifiers"
        ".reference_sequence_identifiers"
    )
    query = {
        "query": {
            "type": "group",
            "logical_operator": "and",
            "nodes": [
                {
                    "type": "terminal",
                    "service": "text",
                    "parameters": {
                        "attribute": f"{identifiers}.database_accession",
                        "operator": "exact_match",
                        "value": sequence_id,
                    },
                },
                {
                    "type": "terminal",
                    "service": "text",
                    "parameters": {
                        "attribute": f"{identifiers}.database_name",
                        "operator": "exact_match",
                        "value": "UniProt",
                    },
                },
            ],
        },
        "return_type": "polymer_entity",
    }

    # Let requests URL-encode the JSON instead of splicing it into the URL.
    response = requests.get(
        "https://search.rcsb.org/rcsbsearch/v2/query",
        params={"json": json.dumps(query)},
        timeout=30,
    )
    if response.status_code != 200:
        return None

    for result in response.json().get("result_set") or []:
        identifier = result.get("identifier")
        if identifier:
            # Entity identifiers look like "<entry>_<entity>"; keep the entry.
            return identifier.split("_")[0]
    return None

def download_pdb(pdb_id, output_dir="pdb_files"):
    """Download the PDB-format file for ``pdb_id`` into ``output_dir``.

    Args:
        pdb_id: PDB entry identifier used to build the download URL and the
            local file name (``<pdb_id>.pdb``).
        output_dir: Directory to store the file in; created if missing.

    Returns:
        ``True`` when the server answered HTTP 200 and the file was written,
        ``False`` when ``pdb_id`` is empty or any other status was returned.

    Raises:
        requests.RequestException: On connection failures or timeouts.
        OSError: If the directory or file cannot be created.
    """
    if not pdb_id:
        # An empty ID cannot name a real entry; skip the request entirely.
        return False

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    response = requests.get(url, timeout=60)
    if response.status_code != 200:
        return False

    destination = os.path.join(output_dir, f"{pdb_id}.pdb")
    # Write the raw bytes so the saved file matches the download exactly,
    # independent of the platform's default encoding and newline handling.
    with open(destination, "wb") as file:
        file.write(response.content)
    print(f"Downloaded PDB file for {pdb_id}")
    return True

def main(fasta_file):
    """Fetch a PDB file for the first sequence in ``fasta_file`` that has one.

    Each FASTA record's ID (truncated at the first ``.``) is passed to
    ``search_pdb``; the first hit that ``download_pdb`` manages to download
    ends the run. If the records are exhausted without a successful
    download, a message is printed and the process exits with status 1.

    Args:
        fasta_file: Path of the FASTA file to read.
    """
    for index, entry in enumerate(SeqIO.parse(fasta_file, "fasta"), start=1):
        # Consider only what is before the point
        sequence_id = entry.id.partition('.')[0]
        print(f"Searching {index} PDB for sequence ID : {sequence_id}")

        pdb_id = search_pdb(sequence_id)
        if not pdb_id:
            print(f"No PDB found for sequence ID: {sequence_id}")
            continue

        print(f"Found PDB ID: {pdb_id}")
        if download_pdb(pdb_id):
            print(f"Successfully downloaded PDB for sequence ID: {sequence_id}")
            return
        # A failed download is not reported; move on to the next record.

    print("No PDB files found for any sequences in the file.")
    sys.exit(1)

# Script entry point: run the search over the FASTA file at a fixed relative path.
if __name__ == "__main__":
    fasta_file = "curated_dataset/sequences.fasta"
    main(fasta_file)


Searching 1 PDB for sequence ID : A0A1Y2ANA8
No PDB found for sequence ID: A0A1Y2ANA8
Searching 2 PDB for sequence ID : V3ZBP4
No PDB found for sequence ID: V3ZBP4
Searching 3 PDB for sequence ID : A0A226EIY0
No PDB found for sequence ID: A0A226EIY0
Searching 4 PDB for sequence ID : A0A6J1SJU2
No PDB found for sequence ID: A0A6J1SJU2
Searching 5 PDB for sequence ID : Q54ST1
No PDB found for sequence ID: Q54ST1
Searching 6 PDB for sequence ID : A0A0Q9YK82
No PDB found for sequence ID: A0A0Q9YK82
Searching 7 PDB for sequence ID : A0A6M0QXA2
No PDB found for sequence ID: A0A6M0QXA2
Searching 8 PDB for sequence ID : A0A2M6GI39
No PDB found for sequence ID: A0A2M6GI39
Searching 9 PDB for sequence ID : A1ZEV9
No PDB found for sequence ID: A1ZEV9
Searching 10 PDB for sequence ID : A0A6S4QMA3
No PDB found for sequence ID: A0A6S4QMA3
Searching 11 PDB for sequence ID : A0A2D6BQU1
No PDB found for sequence ID: A0A2D6BQU1
Searching 12 PDB for sequence ID : A0A494T8I3
No PDB found for sequence ID: 