In [2]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np
import urllib
from io import StringIO
from IPython.display import display, HTML
import logging

# Base URL of the UniProtKB search endpoint. The query text (an identifier plus
# optional filters) is appended directly after "query=" by get_uniprot_seqs.
UNIPROT_REST_API = "https://rest.uniprot.org/uniprotkb/search?query="

In [5]:
def _fetch_uniprot_results(url):
    """
    Send one GET request to the UniProt REST API and return its "results" list.

    Returns None (after logging an error) when the server answers with a non-OK
    status, so the caller can skip the ID instead of failing while parsing an
    error payload that has no "results" key.
    """
    r = requests.get(url)
    if not r.ok:
        logging.error(
            f"UniProt server request returned with error status code: {r.status_code}. Please double-check arguments or try again later."
        )
        return None
    # Treat a payload without a "results" key the same as "no hits".
    return r.json().get("results", [])


def _first_gene_name(entry):
    """Return the first gene name of one UniProt result entry, or NaN if it has none."""
    try:
        return entry["genes"][0]["geneName"]["value"]
    except (KeyError, IndexError, TypeError):
        # Entry has no "genes" list, an empty one, or no "geneName" value.
        return np.nan


def get_uniprot_seqs(server, ensembl_ids):
    """
    Retrieve UniProt sequences based on Ensembl, WormBase or FlyBase identifiers.

    For every ID, reviewed UniProt entries are requested first. If none are
    found, the query is repeated without the "reviewed" filter and a warning is
    logged when unreviewed entries are returned instead.

    Args:
    - server        Link to UniProt REST API server (the query is appended to it as-is).
    - ensembl_ids   One or more Ensembl, WormBase or FlyBase IDs (string or list of strings).

    Returns data frame with UniProt ID, organism, sequence, sequence length, gene name, and query ID.
    The data frame is empty if no ID produced a match. IDs whose request comes back
    with an error status are logged and skipped. The row index of each per-ID result
    is kept, so index labels repeat when several IDs are queried.
    """

    # If a single ID is passed as string, convert to list
    if isinstance(ensembl_ids, str):
        ensembl_ids = [ensembl_ids]

    # Flattened JSON field -> output column name (also fixes the column order)
    column_names = {
        "primaryAccession": "uniprot_id",
        "organism.scientificName": "organism",
        "sequence.value": "sequence",
        "sequence.length": "sequence_length",
    }

    # Per-ID frames are collected and concatenated once at the end
    frames = []

    for id_ in ensembl_ids:
        # API documentation: https://www.uniprot.org/help/api_queries
        results = _fetch_uniprot_results(server + id_ + "+AND+reviewed:true")
        if results is None:
            # Request failed (already logged); nothing to parse for this ID
            continue

        # If no reviewed results were found, try again for unreviewed results
        if not results:
            results = _fetch_uniprot_results(server + id_)
            if results is None:
                continue

            # Warn user if unreviewed results were found
            if results:
                logging.warning(
                    f"No reviewed UniProt results were found for ID {id_}. Returning all unreviewed results."
                )

        if not results:
            # If no results were found, warn user and add nothing for this ID
            logging.warning(f"No UniProt sequences were found for ID {id_}.")
            continue

        # Flatten the nested records and keep only the relevant columns;
        # reindex yields NaN instead of a KeyError if a field is absent
        # from every record of this response.
        df = (
            pd.json_normalize(results)
            .reindex(columns=list(column_names))
            .rename(columns=column_names)
        )

        # Add gene name and query columns
        df["gene_name"] = [_first_gene_name(entry) for entry in results]
        df["query"] = id_

        frames.append(df)

    # pd.concat raises on an empty list, so return an empty df explicitly
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, axis=0)

In [6]:
# Example lookup for two IDs; the returned data frame is the cell's output.
get_uniprot_seqs(
    server=UNIPROT_REST_API,
    ensembl_ids=["ENST00000392653.3", "ENST00000392657.7"],
)

Unnamed: 0,uniprot_id,organism,sequence,sequence_length,gene_name,query
0,P35326,Homo sapiens,MSYQQQQCKQPCQPPPVCPTPKCPEPCPPPKCPEPCPPPKCPQPCP...,72,SPRR2A,ENST00000392653.3
0,A7KAX9,Homo sapiens,METESESSTLGDDSVFWLESEVIIQVTDCEEEEREEKFRKMKSSVH...,2087,ARHGAP32,ENST00000392657.7
