In [1]:
import os
import json
from pprint import pprint

import requests
import xml.etree.ElementTree as ET
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type

In [2]:
def check_item(item):
    """Return the text of a looked-up XML element, or "NaN" when it is absent.

    Parameters
    ----------
    item : object or None
        Result of an element lookup such as ``root.find(...)``. Anything that
        is not ``None`` must expose a ``.text`` attribute.

    Returns
    -------
    str or None
        The string ``"NaN"`` if ``item`` is ``None``; otherwise ``item.text``
        unchanged (which may itself be ``None``).
    """
    # Identity test on purpose: only a missing element maps to "NaN". A present
    # element is never judged by its truthiness.
    return "NaN" if item is None else item.text


@retry(
    stop=stop_after_attempt(6),
    wait=wait_fixed(3),
    retry=retry_if_exception_type(requests.RequestException),
)
def call_esearch(gene_name, max_uids, sort_uids, timeout=30):
    """Search the NCBI Gene database for human records matching a gene name.

    The call is retried (up to 6 attempts, 3 seconds apart) whenever it raises
    a ``requests.RequestException``; this covers connection errors, timeouts
    and the HTTP error raised by ``raise_for_status()``.

    Parameters
    ----------
    gene_name : str
        Gene name placed in the ``[GENE]`` field of the search term.
    max_uids : int
        Sent as ``retmax``.
    sort_uids : str
        Sent as ``sort`` (the notebook uses ``"relevance"``).
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without it a stalled connection would block
        forever and the retry policy would never get a chance to run.

    Returns
    -------
    requests.Response
        The successful response; the body is JSON (``retmode=json``).

    Notes
    -----
    The API key is read from the ``NCBI_API_KEY`` environment variable instead
    of being hard-coded in the notebook. When the variable is unset or empty
    the ``api_key`` parameter is omitted entirely.
    NOTE(review): NCBI documents key-less access at a lower rate limit; confirm
    that limit is acceptable for your gene list.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

    params = {
        "db": "gene",
        "retmode": "json",
        "retmax": max_uids,
        "term": f"{gene_name}[GENE] AND Homo sapiens[ORGN]",
        "sort": sort_uids,
    }

    # Never send a placeholder key: only attach one that was really configured.
    api_key = os.environ.get("NCBI_API_KEY")
    if api_key:
        params["api_key"] = api_key

    response = requests.get(base_url, params=params, timeout=timeout)
    response.raise_for_status()

    return response


@retry(
    stop=stop_after_attempt(6),
    wait=wait_fixed(3),
    retry=retry_if_exception_type(requests.RequestException),
)
def call_efetch(gene_id, timeout=30):
    """Fetch the XML record of one entry of the NCBI Gene database.

    The call is retried (up to 6 attempts, 3 seconds apart) whenever it raises
    a ``requests.RequestException``; this covers connection errors, timeouts
    and the HTTP error raised by ``raise_for_status()``.

    Parameters
    ----------
    gene_id : str or int
        Sent as the ``id`` parameter.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without it a stalled connection would block
        forever and the retry policy would never get a chance to run.

    Returns
    -------
    requests.Response
        The successful response; the body is XML (``retmode=xml``).

    Notes
    -----
    The API key is read from the ``NCBI_API_KEY`` environment variable instead
    of being hard-coded in the notebook. When the variable is unset or empty
    the ``api_key`` parameter is omitted entirely.
    NOTE(review): NCBI documents key-less access at a lower rate limit; confirm
    that limit is acceptable for your gene list.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

    params = {"db": "gene", "id": gene_id, "retmode": "xml"}

    # Never send a placeholder key: only attach one that was really configured.
    api_key = os.environ.get("NCBI_API_KEY")
    if api_key:
        params["api_key"] = api_key

    response = requests.get(base_url, params=params, timeout=timeout)
    response.raise_for_status()

    return response

In [3]:
# Examples:
all_genes = ["HER1", "T", "C1orf179"]
gene_id_dict = {}

# Resolve every gene symbol to a single Entrez Gene ID.
# When esearch returns several IDs the choice is, in order of priority:
#   1. an active record whose official symbol equals the query,
#   2. the first active record that lists the query among its aliases,
#   3. the first active record in relevance order.
# A record counts as discontinued when its XML carries a
# Gene-track_discontinue-date element; such records are never selected.
for gene in all_genes:
    try:
        response1 = call_esearch(gene, max_uids=25, sort_uids="relevance")

        ids_list = response1.json()["esearchresult"]["idlist"]
        candidate_gene_id = []  # active IDs that list `gene` as an alias

        if len(ids_list) == 0:
            print(
                f"For gene {gene}, the exact Gene ID was not found in NCBI!" + "\n\n"
            )
            with open("genes_None.txt", "a+") as f:
                f.write(gene + "\n")
            continue

        elif len(ids_list) == 1:
            gene_id = ids_list[0]

            response2 = call_efetch(gene_id)
            root = ET.fromstring(response2.text)
            discontinued_flag = root.find(
                ".//Entrezgene_track-info/Gene-track/Gene-track_discontinue-date"
            )

            if discontinued_flag is None:
                print(
                    f"For gene {gene}, a unique Gene ID: {ids_list[0]} was found in NCBI"
                    + "\n\n"
                )
            else:
                print(
                    f"The only Gene ID {gene_id} of gene {gene} was found, but it was Discontinued by NCBI!"
                    + "\n\n"
                )
                with open("genes_NCBI_Discontinued_1.txt", "a+") as f:
                    f.write(f"{gene}__{gene_id}" + "\n")
                continue

        elif len(ids_list) > 1:
            print(
                f"For gene {gene}, {len(ids_list)} Gene IDs: {ids_list} was found in NCBI"
                + "\n\n"
            )
            # Fallback for "no symbol or alias match". It must be an ID that was
            # verified as active, so it starts empty rather than as ids_list[0]
            # (which may turn out to be discontinued).
            gene_id = None

            for check_gene_id in ids_list:
                response2 = call_efetch(check_gene_id)
                root = ET.fromstring(response2.text)

                discontinued_flag = root.find(
                    ".//Entrezgene_track-info/Gene-track/Gene-track_discontinue-date"
                )
                if discontinued_flag is not None:
                    print(
                        f"For the Gene ID {check_gene_id} of gene {gene}, it has been Discontinued by NCBI!"
                        + "\n\n"
                    )
                    with open("genes_NCBI_Discontinued_2.txt", "a+") as f:
                        f.write(f"{gene}__{check_gene_id}" + "\n")
                    continue

                print(
                    f"Check the Gene ID {check_gene_id} for the gene {gene}..." + "\n\n"
                )
                if gene_id is None:
                    # First active record in relevance order.
                    gene_id = check_gene_id

                official_symbol = [
                    o.text
                    for o in root.findall(".//Entrezgene_gene/Gene-ref/Gene-ref_locus")
                ]
                gene_aliases = {
                    g.text
                    for g in root.findall(
                        ".//Entrezgene_gene/Gene-ref/Gene-ref_syn/Gene-ref_syn_E"
                    )
                }

                if not official_symbol:
                    print(
                        f"For Gene ID {check_gene_id} of gene {gene}, Official Symbol of gene was not found!"
                        + "\n\n"
                    )
                    official_symbol = None  # can never equal `gene` below
                elif len(official_symbol) > 1:
                    print(
                        f"For the Gene ID {check_gene_id} of the gene {gene}, there are multiple Official Symbol!"
                        + "\n\n"
                    )
                    official_symbol = official_symbol[0]
                else:
                    official_symbol = official_symbol[0]

                if gene == official_symbol:
                    # An exact official-symbol match beats every alias match.
                    candidate_gene_id = []
                    gene_id = check_gene_id
                    break
                elif gene in gene_aliases:
                    candidate_gene_id.append(check_gene_id)

            if len(candidate_gene_id) > 0:
                gene_id = candidate_gene_id[0]

            if gene_id is None:
                # Every returned ID was discontinued (each one was already
                # logged to genes_NCBI_Discontinued_2.txt above). Skip the gene,
                # mirroring what the single-ID branch does.
                print(
                    f"For gene {gene}, all {len(ids_list)} Gene IDs found in NCBI were Discontinued!"
                    + "\n\n"
                )
                continue

        gene_id_dict[gene] = gene_id
        print(
            f"For Entrez Gene ID {gene_id} of gene {gene}, conversion to Gene ID succeeded!"
            + "\n\n"
        )

    # Catches exhausted retries as well as any parsing error (unexpected JSON
    # layout, malformed XML); the gene is logged as unresolved either way.
    except Exception as e:
        print(
            f"After 6 attempts, the transformation of gene {gene} to Gene ID failed. Error: {e}" + "\n\n"
        )
        with open("genes_None.txt", "a+") as f:
            f.write(gene + "\n")

print(f"The dict that converts Gene Symbol to Entrez Gene ID:\n{gene_id_dict}\n")
for gene, id_ in gene_id_dict.items():
    print(f"{gene} -> {id_}")

For gene HER1, a unique Gene ID: 1956 was found in NCBI


For Entrez Gene ID 1956 of gene HER1, conversion to Gene ID succeeded!


For gene T, a unique Gene ID: 6862 was found in NCBI


For Entrez Gene ID 6862 of gene T, conversion to Gene ID succeeded!


For gene C1orf179, a unique Gene ID: 338094 was found in NCBI


For Entrez Gene ID 338094 of gene C1orf179, conversion to Gene ID succeeded!


The dict that converts Gene Symbol to Entrez Gene ID:
{'HER1': '1956', 'T': '6862', 'C1orf179': '338094'}

HER1 -> 1956
T -> 6862
C1orf179 -> 338094


In [4]:
# For every resolved Gene ID: download the gene record, keep a copy of the raw
# XML under ./genes_xml_tmp, and extract the key fields into gene_info_dict
# (keyed by Gene ID). Genes that fail are appended to failed_genes_id.txt.
os.makedirs("./genes_xml_tmp", exist_ok=True)
gene_info_dict = {}

for gene_name, gene_id in gene_id_dict.items():
    # Reset per gene. Otherwise, when call_efetch itself fails, the error
    # handler below would either raise NameError (first gene) or print the
    # previous gene's response.
    response = None

    try:
        response = call_efetch(gene_id)
        gene_xml = response.text
        # Explicit UTF-8 so the write does not depend on the platform's
        # default text encoding.
        with open(
            f"./genes_xml_tmp/{gene_name}__{gene_id}.xml", "w", encoding="utf-8"
        ) as f:
            f.write(gene_xml)

        root = ET.fromstring(gene_xml)

        # A record without Gene-ref_locus raises AttributeError here, which the
        # handler below reports as a failed gene.
        official_symbol = root.find(".//Entrezgene_gene/Gene-ref/Gene-ref_locus").text

        gene_aliases = [
            g.text
            for g in root.findall(
                ".//Entrezgene_gene/Gene-ref/Gene-ref_syn/Gene-ref_syn_E"
            )
        ]

        # The gene type is stored in the element's "value" attribute, not its text.
        gene_type = root.find(".//Entrezgene_type")
        if gene_type is None:
            gene_type = "NaN"
        else:
            gene_type = gene_type.attrib["value"]

        description = check_item(root.find(".//Entrezgene_gene/Gene-ref/Gene-ref_desc"))

        gene_summary = check_item(root.find(".//Entrezgene_summary"))

        gene_info_dict[gene_id] = {
            "official_symbol": official_symbol,
            "description": description,
            "gene_type": gene_type,
            "summary_info": gene_summary,
            "gene_aliases": gene_aliases,
        }

        # Sanity check: the queried name should be the official symbol or an alias.
        check_gene_in = [official_symbol] + gene_aliases
        decision_ = gene_name in set(check_gene_in)
        print(f"Check the gene {gene_name} is included in {check_gene_in}: {decision_}")
        print(
            f"For gene {official_symbol}, Gene ID: {gene_id}, key information was successfully obtained!" + "\n\n"
        )

    except Exception as e:
        print(
            f"After 6 attempts, the gene {gene_name} | Gene ID is {gene_id}, the query for related information failed. Error: {e}"
            + "\n\n"
        )
        # Only show a body that belongs to this gene's request.
        if response is not None:
            print(response.text)
        with open("failed_genes_id.txt", "a+") as f:
            f.write(f"{gene_name}__{gene_id}" + "\n")

print("The dict that contains gene information:\n")
pprint(gene_info_dict)

Check the gene HER1 is included in ['EGFR', 'ERBB', 'ERRP', 'HER1', 'mENA', 'ERBB1', 'NNCIS', 'PIG61', 'NISBD2']: True
For gene EGFR, Gene ID: 1956, key information was successfully obtained!


Check the gene T is included in ['TBXT', 'T', 'TFT', 'SAVA']: True
For gene TBXT, Gene ID: 6862, key information was successfully obtained!


Check the gene C1orf179 is included in ['FAM151A', 'C1orf179']: True
For gene FAM151A, Gene ID: 338094, key information was successfully obtained!


The dict that contains gene information:

{'1956': {'description': 'epidermal growth factor receptor',
          'gene_aliases': ['ERBB',
                           'ERRP',
                           'HER1',
                           'mENA',
                           'ERBB1',
                           'NNCIS',
                           'PIG61',
                           'NISBD2'],
          'gene_type': 'protein-coding',
          'official_symbol': 'EGFR',
          'summary_info': 'The protein encoded b