In [None]:
import pathlib
from uniProt.protein import Protein, get_protein
from uniProt.rdfgraph import RDFGraph
from uniProt.location import Location, get_location
from uniProt.api import QueryParameters, query_proteins, get_protein_graph, get_location_graph
from uniProt.mapping import LocationIDs, TaxonomyIDs
from uniProt.localcopy import LocalCopy

In [None]:
def print_protein_info(prot: Protein):
    """Print a plain-text summary of a protein entry to standard output.

    The scalar fields (name, UniProt ID, gene, description) come first,
    followed by the isoforms, the known subcellular locations and the
    keywords, each group announced by a count line.

    Args:
        prot: Entry to summarise. The attributes read are
            ``recommended_name``, ``uniprot_id``, ``gene``, ``description``,
            ``isoforms``, ``locations`` (items expose ``label``) and
            ``keywords`` (items expose ``label`` and ``keyword_type.value``).

    Returns:
        None; everything is written with ``print``.
    """
    print(f"Name: {prot.recommended_name}")
    print(f"UniProt ID: {prot.uniprot_id}")
    print(f"Gene: {prot.gene}")
    print(f"Description: {prot.description}")

    isoforms = prot.isoforms
    print(f"Number of Isoforms: {len(isoforms)}")
    for iso in isoforms:
        print(f"-- {iso}")

    known_locations = prot.locations
    print(f"Number of Known Subcellular Locations: {len(known_locations)}")
    for place in known_locations:
        print(f"--{place.label}")

    tags = prot.keywords
    print(f"Protein is classified by {len(tags)} keywords.")
    for tag in tags:
        print(f"--{tag.label} ({tag.keyword_type.value})")


def print_location_info(loc: Location):
    """Print a plain-text summary of a subcellular location to standard output.

    Related locations are held on ``loc`` as IDs only, so each one is
    resolved with ``get_location`` to obtain its label.

    Args:
        loc: Location to summarise. The attributes read are ``label``,
            ``uniprot_id``, ``comment``, ``subclass`` (an ID or ``None``)
            and ``part_of`` (a sized iterable of IDs).

    Returns:
        None; everything is written with ``print``.
    """
    print(f"Label: {loc.label}")
    print(f"UniProt ID: {loc.uniprot_id}")
    print(f"Description: {loc.comment}")

    parent_id = loc.subclass
    if parent_id is not None:
        parent = get_location(parent_id)
        print(f"Subclass of {parent.label} (ID: {parent_id})")

    containers = loc.part_of
    print(f"Location part of {len(containers)} locations")
    for container_id in containers:
        container = get_location(container_id)
        print(f"--{container.label} (ID: {container_id})")
    
        
def print_rdf(rdf: RDFGraph):
    """Print every triple of an RDF graph, one triple per line.

    Args:
        rdf: Iterable yielding ``(subject, predicate, object)`` triples.

    Returns:
        None; each triple is written with ``print``, space-separated.
    """
    for subject, predicate, obj in rdf:
        print(subject, predicate, obj)

With this library, you can fetch an entry from UniProt as a `Protein` object by its UniProt ID.

In [None]:
# Fetch the entry with UniProt ID P03522 as a Protein and print its summary.
vsvg_id = 'P03522'
vsvg = get_protein(vsvg_id)
print_protein_info(vsvg)

In [None]:
# Same lookup for a second entry, UniProt ID P42212.
gfp_id = 'P42212'
gfp = get_protein(gfp_id)
print_protein_info(gfp)

The `Protein` class only holds a subset of the data from UniProt. You can also get the entire UniProt entry as an `RDFGraph`. `RDF` (Resource Description Framework) is a family of World Wide Web Consortium (W3C) specifications originally designed as a metadata data model. The `RDFGraph` contains semantic triples (subject, predicate, object).

In [None]:
# Fetch the full RDF graph for both entries; only the first graph is printed
# in this cell.
vsvg_rdf = get_protein_graph(vsvg_id)
gfp_rdf = get_protein_graph(gfp_id)
print_rdf(vsvg_rdf)

Similarly, you can get a subcellular location as a `Location` object via the UniProt ID of the location.

In [None]:
# A location ID can be written as a literal string or read from the `.value`
# of a `LocationIDs` member.
virion_membrane_id = '275'
er_id = LocationIDs.ENDOPLASMIC_RETICULUM.value
virion_membrane = get_location(virion_membrane_id)
er = get_location(er_id)

In [None]:
# Summarise the location fetched with the literal ID.
print_location_info(virion_membrane)

In [None]:
# Summarise the location fetched with the `LocationIDs` value.
print_location_info(er)

You can also store a local copy of the entries from UniProt using the `LocalCopy` class. The `LocalCopy` constructor accepts a filepath that contains downloaded copies of UniProt entries, as well as an option to save copies of entries that are not yet in the local copy.

In [None]:
# Back the local copy with the relative path '.localuniprot'; save_copy=True
# asks it to store entries that are not already present.
fp = pathlib.Path('.localuniprot')
lc = LocalCopy(filepath=fp, save_copy=True)

In [None]:
# Look the protein up through the local copy rather than the module-level
# get_protein, then print the same summary.
local_vsvg = lc.get_protein(vsvg_id)
print_protein_info(local_vsvg)

In [None]:
# Look the location up through the local copy rather than the module-level
# get_location, then print the same summary.
local_er = lc.get_location(er_id)
print_location_info(local_er)

You can also query UniProt to get a list of UniProt protein IDs. Currently you can query proteins based on the `LocationIDs` and `TaxonomyIDs` available in `uniProt.mapping`. The following example queries UniProt for the IDs of proteins that are found in the `Human` genome and are located in either the `Endoplasmic Reticulum` or the `Nucleus`.

In [None]:
# Build a query with reviewed=True for human entries located in the
# endoplasmic reticulum or the nucleus.
taxonomies = [TaxonomyIDs.HUMAN]
locations = [
    LocationIDs.ENDOPLASMIC_RETICULUM,
    LocationIDs.NUCLEUS,
]
params = QueryParameters(reviewed=True, taxonomies=taxonomies, locations=locations)

# Last expression of the cell, so the notebook displays the returned result.
query_proteins(params)