In [1]:
""" Program takes list of protein identifiers "3d.list" and retrieves their protein and nucleotide sequences 
from the protein database webpage "http://uniprot.org"
""" 

' Program takes list of protein identifiers "3d.list" and retrieves their protein and nucleotide sequences \nfrom the protein database webpage "http://uniprot.org"\n'

In [2]:
from bs4 import BeautifulSoup
import requests
import threading
import queue
import time

In [3]:
# URL constants
# Base URL of a UniProt entry page; worker() appends a UniProt ID to it.
START_URL = 'https://www.uniprot.org/uniprot/'
# Prefix and suffix that worker() wraps around a CDS ID to build the URL of
# the CDS record (the suffix asks for the XML display of the record).
new_url1 = 'https://www.ebi.ac.uk/ena/data/view/'
new_url2 = '&display=xml'

In [4]:
def _fetch_soup(url, timeout):
    """Download ``url`` and parse it with lxml; return None (after reporting) on any HTTP/network error."""
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as err:
        # One unreachable page must not kill the whole worker thread.
        print("Skipping {0}: {1}".format(url, err))
        return None
    return BeautifulSoup(response.text, "lxml")


def _strip_version(cds_label):
    """Drop the trailing ".<version>" from a CDS label; labels without a dot are returned unchanged."""
    base, dot, _version = cds_label.rpartition('.')
    return base if dot else cds_label


def worker(queue_ids, queue_sequences, timeout=30):
    """ Get UniProtIds from queue_ids and put to queue_sequences sequences of proteins and CDS

    Runs until queue_ids is drained. For every UniProt ID the UniProt page is
    fetched, the first CDS ID found on it is looked up through the CDS URL and
    a tuple ``(uniprot_id, aa_sequence, nucl_sequence)`` is put on
    queue_sequences. IDs whose pages cannot be downloaded, that list no CDS,
    or whose CDS record lacks the expected fields are skipped.

    Parameters:
        queue_ids: queue.Queue of UniProt ID strings to process.
        queue_sequences: queue.Queue receiving the result tuples.
        timeout: seconds to wait for each HTTP request before giving up on
            that ID (default 30).

    Returns:
        None
    """
    while True:
        # empty() followed by a blocking get() is a race when several threads
        # share the queue: another thread can take the last item in between
        # and this one would then block forever. get_nowait() is atomic.
        try:
            uniprot_id = queue_ids.get_nowait()
        except queue.Empty:
            return

        soup = _fetch_soup(START_URL + uniprot_id, timeout)
        if soup is None:
            continue

        # Collect all CDS ids (link text without the version suffix)
        cds = [_strip_version(link.get_text())
               for link in soup.find_all('a', {'class': "embl_cds"})]

        # In rare cases CDS records are absent, then we skip to the next UniProtId
        if not cds:
            continue

        # Retrieve web-page with CDS
        # In this version of the program we use just the first CDS,
        # but in the future we are going to use all of them.
        cds_soup = _fetch_soup(new_url1 + cds[0] + new_url2, timeout)
        if cds_soup is None:
            continue

        # Amino acid sequence is under the field 'value', the last instance;
        # nucleotide sequence is under the first 'sequence' field.
        value_tags = cds_soup.find_all('value')
        sequence_tag = cds_soup.find('sequence')
        if not value_tags or sequence_tag is None:
            continue

        # Remove all whitespace/newlines embedded in the sequences
        aa_sequence = ''.join(value_tags[-1].get_text().split())
        nucl_sequence = ''.join(sequence_tag.get_text().split())
        queue_sequences.put((uniprot_id, aa_sequence, nucl_sequence))

In [None]:
# Number of threads
num_threads = 32

# Counting time
start = time.time()

# Here we open the list of protein IDs which we are interested in.
# The IDs are whitespace-separated; `with` closes the file even if reading fails.
with open("3d.list", "r") as fh:
    readFile = fh.read()
uniprot_ids = readFile.split()

# Initializing queues: IDs to process and the results produced by the workers
queue_ids = queue.Queue()
queue_sequences = queue.Queue()

# Fill the work queue before any thread starts, so workers that find it
# empty can safely conclude that all work is done
for uniprot_id in uniprot_ids:
    queue_ids.put(uniprot_id)

# Starting threads
threads = []
for i in range(num_threads):
    t = threading.Thread(target=worker, args=(queue_ids, queue_sequences))
    t.start()
    threads.append(t)

# Guarantee that all threads are finished
for t in threads:
    t.join()

# Print results in csv format. All producers have finished, so draining
# the queue with empty()/get() from this single thread is safe.
outId = []
with open("idProteinNucleotide.csv", "w") as outFile:
    while not queue_sequences.empty():
        uniprot_id, aa_sequence, nucl_sequence = queue_sequences.get()
        print("{0}, {1}, {2}".format(uniprot_id, aa_sequence, nucl_sequence), file=outFile)
        # Keep track of the IDs for which sequences were retrieved
        outId.append(uniprot_id)

# Reporting time
print("Working time:", round(time.time() - start), "seconds")