In [1]:
import requests

In [2]:
def rest_query(server, query, content_type):
    """
    Function to perform a REST API query.

    Args:
    - server        Server to query.
    - Query         Query that is passed to server.
    - content_type  Content type requested from the server.

    Returns server output.
    """

    r = requests.get(server + query, headers={"Content-Type": content_type})

    if not r.ok:
        raise RuntimeError(
            f"{server} returned error status code {r.status_code}. "
            "Please double-check arguments and try again.\n"
        )

    if content_type == "application/json":
        return r.json()
    else:
        return r.text

In [9]:
gene_id = "ENSG00000157764"

content_type = "application/json"
server = "https://api.ncbi.nlm.nih.gov/datasets/v1"
query = f"/gene/id/{gene_id}/download_summary"

rest_query(server, query, content_type)

RuntimeError: https://api.ncbi.nlm.nih.gov/datasets/v1 returned error status code 400. Please double-check arguments and try again.


In [71]:
import urllib
import pandas as pd
from io import StringIO

import numpy as np
import logging

In [74]:
def get_uniprot_info(server, ensembl_id, id_type):
    """
    Retrieve UniProt synonyms and description based on Ensemsbl identifiers.

    Args:
    - server          Link to UniProt REST API server.
    - ensembl_id      Ensembl ID (str).
    - id_type         "Gene" or "Transcript"

    Returns data frame with UniProt ID, gene name, organism, sequence, sequence length, and query ID.
    """

    if id_type == "Gene":
        ens_id_type = "ENSEMBL_ID"
    elif id_type == "Transcript":
        ens_id_type = "ENSEMBL_TRS_ID"
    else:
        logging.warning(
            f"Ensembl_ID '{ensembl_id}' was not recognized as either gene nor transcript. Gene name synonyms and description will not be fetched from UniProt."
        )
        return

    # Define query arguments
    # Columns documentation: https://www.uniprot.org/help/uniprotkb%5Fcolumn%5Fnames
    # from/to IDs documentation: https://www.uniprot.org/help/api_idmapping
    query_args = {
        "from": ens_id_type,
        "to": "ACC",
        "format": "tab",
        "query": ensembl_id,
        "columns": "id,genes(PREFERRED),genes,protein names,comment(FUNCTION),organism,reviewed",
    }
    # Reformat query arguments
    query_args = urllib.parse.urlencode(query_args)
    query_args = query_args.encode("ascii")

    # Submit query to UniProt server
    request = urllib.request.Request(server, query_args)

    # Read and clean up results
    with urllib.request.urlopen(request) as response:
        res = response.read()

    # Check if URL retruned error code
    if response.getcode() != 200:
        raise RuntimeError(
            f"The UniProt server returned error status code {response.getcode()}. Please try again."
        )

    # Initiate data frame so empty df will be returned if no matches are found
    df = pd.DataFrame()

    try:
        # This will throw an EmptyDataError if no results were found
        df = pd.read_csv(StringIO(res.decode("utf-8")), sep="\t")

        if len(df.columns) == 8:
            # Rename columns
            df.columns = [
                "uniprot_id",
                "primary_gene_name",
                "synonyms",
                "protein_names",
                "uniprot_description",
                "organism",
                "status",
                "query",
            ]
        # Sometimes an extra "isomap" column is returned.
        if len(df.columns) == 9:
            # Drop isomap column (last column)
            df = df.iloc[:, :-1]
            # Rename columns
            df.columns = [
                "uniprot_id",
                "primary_gene_name",
                "synonyms",
                "protein_names",
                "uniprot_description",
                "organism",
                "status",
                "query",
            ]
        try:
            # Split gene names into list of strings
            df["synonyms"] = df["synonyms"].str.split(" ")
        except:
            None

        # If there are reviewed results, return only reviewed results
        if "reviewed" in df["status"].values:
            logging.info("Returning only reviewed UniProt results.")
            # Only keep rows where status is "reviewed"
            df = df[df.status == "reviewed"]

        else:
            logging.info("No reviewed UniProt results were found. Returning all unreviewed results.")

        # Return set of all results if more than one UniProt ID was found for this Ensembl ID
        final_df = pd.DataFrame()
        for column in df.columns:
            if column == "synonyms":
                # Flatten synonym lists
                syn_lists = df[column].values
                flat_list = [item for sublist in syn_lists for item in sublist]
                final_df[column] = [list(set(flat_list))]
            else:
                final_df[column] = [list(set(df[column].values))]

    # If no results were found, return None
    except pd.errors.EmptyDataError:
        return None

    return final_df

In [75]:
get_uniprot_info("https://www.uniprot.org/uploadlists/", "ENSG00000157764", "Gene")

Unnamed: 0,uniprot_id,primary_gene_name,synonyms,protein_names,uniprot_description,organism,status,query
0,[P15056],[BRAF],"[BRAF, BRAF1, RAFB1]",[Serine/threonine-protein kinase B-raf (EC 2.7...,[FUNCTION: Protein kinase involved in the tran...,[Homo sapiens (Human)],[reviewed],[ENSG00000157764]
