In [9]:
#test_genes = ['LOC102163816', 'TTN', 'LOC100513133', 'LOC110257246', 'NEBL', 'ABLIM1', 'CTDSP1', 'ANKRD1', 'MYBPC3', 'L1TD1', 'RPP30']

with open('./Data/absolute_genes.txt') as f:
  contents = f.read()
  genes = contents.split("\n")
  genes_set1 = genes[0:5000]
  genes_set2 = genes[5000:10000]
  genes_set3 = genes[10001:15000]
  genes_set4 = genes[15001:20000]
  genes_set5 = genes[20001: len(genes)]  
  print(genes)

['LOC102163816', 'TTN', 'LOC100513133', 'LOC110257246', 'NEBL', 'ABLIM1', 'CTDSP1', 'ANKRD1', 'MYBPC3', 'L1TD1', 'RPP30', 'SLC8A1', 'TPM1', 'MYH7', 'TNNI3', 'DDX5', 'LOC110256529', 'RBPMS', 'PFKFB1', 'ZFAT', 'SORBS2', 'DDX17', 'RYR2', 'DLC1', 'SORBS1', 'TANC2', 'MYL2', 'ZBTB20', 'PDE4D', 'TCP11', 'LOC102162486', 'LOC106504879', 'CMYA5', 'REL', 'AAK1', 'RBM24', 'TPM2', 'LOC100516731', 'PDLIM5', 'LOC110255408', 'LOC100624559', 'GLS', 'LOC102164640', 'LUC7L3', 'LOC100626591', 'SPTBN1', 'CRYBG3', 'CD36', 'FKBP14', 'TXNIP', 'FHL2', 'ZNF671', 'TCEANC2', 'ACTC1', 'ACTA1', 'ATP2A2', 'LACC1', 'LCMT2', 'LOC110260761', 'LOC106508546', 'C12H17orf100', 'LOC110261249', 'B3GALNT1', 'PNISR', 'ALG11', 'NKTR', 'MFSD4A', 'SON', 'ATXN7', 'PSPC1', 'TRA2A', 'LOC110255584', 'CPSF4L', 'MYH7B', 'QKI', 'MARS2', 'PNPT1', 'LOC110257572', 'COL3A1', 'NEMP2', 'CAMK2D', 'GHR', 'PPARGC1A', 'SLC25A4', 'AKAP8L', 'COL1A1', 'KLF8', 'TRIT1', 'TNRC6B', 'TNNT2', 'LOC106504370', 'TAPBP', 'TIA1', 'TAF1D', 'GSN', 'WWOX', 'NEXN'

In [6]:
len(genes)

20473

In [10]:
import re, time, json, zlib
from xml.etree import ElementTree
from urllib.parse import urlparse, parse_qs, urlencode
import requests
from requests.adapters import HTTPAdapter, Retry

POLLING_INTERVAL = 3

API_URL = "https://rest.uniprot.org"

retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session_UniProtAPI = requests.Session()
session_UniProtAPI.mount("https://", HTTPAdapter(max_retries=retries))


def submit_id_mapping_uniprot_api(from_db, to_db, ids):
    '''
    FUNCITON:
    - This submits the post request to UniProt's API.
    
    PARAMS:
    - from_db (string): The database to map IDs from
    - to_db (string): The database to map IDs to
    - ids (list of strings): The IDs to map from 
    '''
    request = requests.post(
        f"{API_URL}/idmapping/run",
        data={"from": from_db, "to": to_db, "ids": ",".join(ids)},
    )
    request.raise_for_status()
    return request.json()["jobId"]



def check_id_mapping_results_ready_uniprot_api(job_id):
    '''
    FUNCTION:
    - This checks if the submitted job is ready.
      If the job is not ready, it will tell you and try again.
      If the job is ready, it will tell you it is ready.
      
    PARAMS:
    - job_id: This is the ID of the job submitted to UniProt
    '''
    while True:
        request = session_UniProtAPI.get(f"{API_URL}/idmapping/status/{job_id}")
        request.raise_for_status()
        j = request.json()
        if "jobStatus" in j:
            if j["jobStatus"] == "RUNNING":
                print(f"Job still running. Retrying in {POLLING_INTERVAL}s")
                time.sleep(POLLING_INTERVAL)
            else:
                raise Exception(request["jobStatus"])
        else:
            return bool(j["results"] or j["failedIds"])
    


def get_id_mapping_results_link_uniprot_api(job_id):
    '''
    FUNCTION:
    - This gets the link where the job results can
      be accessed and then later downloaded by another
      function here.
    
    PARAMS:
    - job_id: This is the ID of the job submitted to UniProt
    '''
    url = f"{API_URL}/idmapping/details/{job_id}"
    request = session_UniProtAPI.get(url)
    request.raise_for_status()
    return request.json()["redirectURL"]


def get_id_mapping_results_search_UniProtAPI(url):
    '''
    FUNCTION:
    - Download the API results from a url
    
    PARAMS:
    - url: the link where the job results can
      be accessed and downloaded here.
    '''
    parsed = urlparse(url)
    query = parse_qs(parsed.query)
    file_format = query["format"][0] if "format" in query else "json"
    if "size" in query:
        size = int(query["size"][0])
    else:
        size = 500
        query["size"] = size
    compressed = (
        query["compressed"][0].lower() == "true" if "compressed" in query else False
    )
    parsed = parsed._replace(query=urlencode(query, doseq=True))
    url = parsed.geturl()
    request = session_UniProtAPI.get(url)
    request.raise_for_status()
    results = decode_results_uniprot_api(request, file_format, compressed)
    total = int(request.headers["x-total-results"])
    print_progress_batches_uniprot_api(0, size, total)
    for i, batch in enumerate(get_batch_uniprot_api(request, file_format, compressed), 1):
        results = combine_batches_uniprot_api(results, batch, file_format)
        print_progress_batches_uniprot_api(i, size, total)
    if file_format == "xml":
        return merge_xml_results_uniprot_api(results)
    return results


def get_next_link_uniprot_api(headers):
    re_next_link = re.compile(r'<(.+)>; rel="next"')
    if "Link" in headers:
        match = re_next_link.match(headers["Link"])
        if match:
            return match.group(1)



def get_batch_uniprot_api(batch_response, file_format, compressed):
    batch_url = get_next_link_uniprot_api(batch_response.headers)
    while batch_url:
        batch_response = session_UniProtAPI.get(batch_url)
        batch_response.raise_for_status()
        yield decode_results_uniprot_api(batch_response, file_format, compressed)
        batch_url = get_next_link_uniprot_api(batch_response.headers)


def get_xml_namespace_uniprot_api(element):
    m = re.match(r"\{(.*)\}", element.tag)
    return m.groups()[0] if m else ""


def merge_xml_results_uniprot_api(xml_results):
    merged_root = ElementTree.fromstring(xml_results[0])
    for result in xml_results[1:]:
        root = ElementTree.fromstring(result)
        for child in root.findall("{http://uniprot.org/uniprot}entry"):
            merged_root.insert(-1, child)
    ElementTree.register_namespace("", get_xml_namespace_uniprot_api(merged_root[0]))
    return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)


def combine_batches_uniprot_api(all_results, batch_results, file_format):
    if file_format == "json":
        for key in ("results", "failedIds"):
            if key in batch_results and batch_results[key]:
                all_results[key] += batch_results[key]
    elif file_format == "tsv":
        return all_results + batch_results[1:]
    else:
        return all_results + batch_results
    return all_results


def print_progress_batches_uniprot_api(batch_index, size, total):
    n_fetched = min((batch_index + 1) * size, total)
    print(f"Fetched: {n_fetched} / {total}", end='\r')


def decode_results_uniprot_api(response, file_format, compressed):
    if compressed:
        decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
        if file_format == "json":
            j = json.loads(decompressed.decode("utf-8"))
            return j
        elif file_format == "tsv":
            return [line for line in decompressed.decode("utf-8").split("\n") if line]
        elif file_format == "xlsx":
            return [decompressed]
        elif file_format == "xml":
            return [decompressed.decode("utf-8")]
        else:
            return decompressed.decode("utf-8")
    elif file_format == "json":
        return response.json()
    elif file_format == "tsv":
        return [line for line in response.text.split("\n") if line]
    elif file_format == "xlsx":
        return [response.content]
    elif file_format == "xml":
        return [response.text]
    return response.text


def convert_ncbi_gene_to_uniprot(genes, debug=False, return_mappings=False):
    genes = set(genes)

    # Convert UniProt IDs to STRING IDs
    job_id = submit_id_mapping_uniprot_api(
        from_db='Gene_Name',
        to_db='UniProtKB',
        ids=genes)

    # Wait until result finished.
    if check_id_mapping_results_ready_uniprot_api(job_id):
        link = get_id_mapping_results_link_uniprot_api(job_id)
        results = get_id_mapping_results_search_UniProtAPI(link)

    # Parse results
    geneISuniprot, uniprotISgene = dict(), dict()

    # result_list[0]['to']['organism']['taxonId']
    for u2s in results['results']:
        gene = u2s['from']
        uniprot = u2s['to']['primaryAccession']

        # if '9606' in string:  # TODO fix hard-coded. TODO this is human specific
        if 9606 == u2s['to']['organism']['taxonId']:
            uniprotISgene[uniprot] = gene
            geneISuniprot[gene] = uniprot

    if debug:
        print(len(geneISuniprot), '/', len(genes), 'genes aligned with UniProt')

    return geneISuniprot, uniprotISgene


#geneISuniprot,_ = convert_ncbi_gene_to_uniprot(genes, debug=True)

In [None]:
#geneISuniprot

In [None]:
#genes_set1
chunked_list_1 = list()
chunk_size = 50
counter = 1
for i in range(0, len(genes_set1), chunk_size):
  geneISuniprot,_ = convert_ncbi_gene_to_uniprot(genes_set1[i:i+chunk_size], debug=True)
  print(counter)
  chunked_list_1.append(geneISuniprot)
  counter = counter + 1

#saving all protein names into one list
protein_list_1 = []

for n in chunked_list_1:
  for i in n.values():
    protein_list_1.append(i)

#putting all proteins into a text file
with open('proliferation_regeneration_prots_1.txt', 'w') as f:
  for element in protein_list_1:
      f.write(element + '\n')

In [None]:
#genes_set2
chunked_list_2 = list()
chunk_size = 50
counter = 1
for i in range(0, len(genes_set2), chunk_size):
  geneISuniprot,_ = convert_ncbi_gene_to_uniprot(genes_set2[i:i+chunk_size], debug=True)
  print(counter)
  chunked_list_2.append(geneISuniprot)
  counter = counter + 1

#saving all protein names into one list
protein_list_2 = []

for n in chunked_list_2:
  for i in n.values():
    protein_list_2.append(i)

#putting all proteins into a text file
with open('proliferation_regeneration_prots_2.txt', 'w') as f:
  for element in protein_list_2:
      f.write(element + '\n')

In [None]:
#genes_set3
chunked_list_3 = list()
chunk_size = 50
counter = 1
for i in range(0, len(genes_set3), chunk_size):
  geneISuniprot,_ = convert_ncbi_gene_to_uniprot(genes_set3[i:i+chunk_size], debug=True)
  print(counter)
  chunked_list_3.append(geneISuniprot)
  counter = counter + 1

#saving all protein names into one list
protein_list_3 = []

for n in chunked_list_3:
  for i in n.values():
    protein_list_3.append(i)

#putting all proteins into a text file
with open('proliferation_regeneration_prots_3.txt', 'w') as f:
  for element in protein_list_3:
      f.write(element + '\n')

In [11]:
#genes_set4
chunked_list_4 = list()
chunk_size = 50
counter = 1
for i in range(0, len(genes_set4), chunk_size):
  geneISuniprot,_ = convert_ncbi_gene_to_uniprot(genes_set4[i:i+chunk_size], debug=True)
  print(counter)
  chunked_list_4.append(geneISuniprot)
  counter = counter + 1

#saving all protein names into one list
protein_list_4 = []

for n in chunked_list_4:
  for i in n.values():
    protein_list_4.append(i)

#putting all proteins into a text file
with open('proliferation_regeneration_prots_4.txt', 'w') as f:
  for element in protein_list_4:
      f.write(element + '\n')

32 / 50 genes aligned with UniProt
1
Fetched: 168460 / 168460

TypeError: string indices must be integers

In [None]:
#genes_set5
chunked_list_5 = list()
chunk_size = 50
counter = 1
for i in range(0, len(genes_set5), chunk_size):
  geneISuniprot,_ = convert_ncbi_gene_to_uniprot(genes_set5[i:i+chunk_size], debug=True)
  print(counter)
  chunked_list_5.append(geneISuniprot)
  counter = counter + 1

#saving all protein names into one list
protein_list_5 = []

for n in chunked_list_5:
  for i in n.values():
    protein_list_5.append(i)

#putting all proteins into a text file
with open('proliferation_regeneration_prots_5.txt', 'w') as f:
  for element in protein_list_5:
      f.write(element + '\n')