## Uniprot REST API to extract Protein Data

In [166]:
#!pip install xmlschema

### [Programmatic access - Retrieving entries via queries](https://www.uniprot.org/help/api_queries)


In [161]:
"""
Queries Uniprot database and retrieves protein data.
"""
import urllib.request
from lxml import etree
import json
import xmlschema


#### REST api sample queries


* UniProtKB results for 'insulin' with the default columns in tab-separated format:

```
url0 = 'https://www.uniprot.org/uniprot/?query=insulin&sort=score&columns=id,\
entry name,reviewed,protein names,genes,organism,length&format=tab'
```

* All reviewed human entries:

```
url1 ="https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:9606"

```

* Specific protein bases entires- specified by data file type

```
url2 ="https://www.uniprot.org/uniprot/P12345.fasta"
url3 ="https://www.uniprot.org/uniprot/P12345.xml"
url4 ="https://www.uniprot.org/uniprot/P12345.txt"
url5 ="https://www.uniprot.org/uniprot/P12345.rdf"
url6 ="https://www.uniprot.org/uniprot/P12345.gff"
```

#### Query to search a single Protein detail

In [162]:
protein = 'P12345'
format = '.xml'
url = 'https://www.uniprot.org/uniprot/'+protein+ format
        
with urllib.request.urlopen(url) as r:
    raw_data = r.read().strip()

In [164]:
tree = etree.fromstring(raw_data)

#### Implement xml schema

In [168]:
schema = xmlschema.XMLSchema('https://www.uniprot.org/docs/uniprot.xsd')

In [169]:
entry_dict = schema.to_dict(tree)
content = entry_dict['entry'][0]
content['protein']

{'alternativeName': [{'fullName': 'Fatty acid-binding protein',
   'shortName': ['FABP-1']},
  {'fullName': 'Glutamate oxaloacetate transaminase 2'},
  {'fullName': 'Kynurenine aminotransferase 4'},
  {'fullName': 'Kynurenine aminotransferase IV'},
  {'fullName': 'Kynurenine--oxoglutarate transaminase 4'},
  {'fullName': 'Kynurenine--oxoglutarate transaminase IV'},
  {'fullName': 'Plasma membrane-associated fatty acid-binding protein',
   'shortName': ['FABPpm']},
  {'fullName': 'Transaminase A'}],
 'recommendedName': {'ecNumber': ['2.6.1.1', '2.6.1.7'],
  'fullName': 'Aspartate aminotransferase, mitochondrial',
  'shortName': ['mAspAT']}}

In [170]:
content['accession']

['P12345', 'G1SKL2']

In [171]:
content['gene']

[{'name': [{'$': 'GOT2', '@type': 'primary'}]}]

In [172]:
content['organism']

{'dbReference': [{'@id': '9986', '@type': 'NCBI Taxonomy'}],
 'lineage': {'taxon': ['Eukaryota',
   'Metazoa',
   'Chordata',
   'Craniata',
   'Vertebrata',
   'Euteleostomi',
   'Mammalia',
   'Eutheria',
   'Euarchontoglires',
   'Glires',
   'Lagomorpha',
   'Leporidae',
   'Oryctolagus']},
 'name': [{'$': 'Oryctolagus cuniculus', '@type': 'scientific'},
  {'$': 'Rabbit', '@type': 'common'}]}

In [173]:
content['reference']

[{'@key': '1',
  'citation': {'@date': '2009-08',
   '@db': 'EMBL/GenBank/DDBJ databases',
   '@type': 'submission',
   'authorList': {'consortium': [{'@name': 'The Genome Sequencing Platform'}],
    'person': [{'@name': 'Di Palma F.'},
     {'@name': 'Heiman D.'},
     {'@name': 'Young S.'},
     {'@name': 'Gnerre S.'},
     {'@name': 'Johnson J.'},
     {'@name': 'Lander E.S.'},
     {'@name': 'Lindblad-Toh K.'}]},
   'title': 'Genome Sequence of Oryctolagus cuniculus (European rabbit).'},
  'scope': ['NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA]'],
  'source': {'strain': ['Thorbecke']}},
 {'@key': '2',
  'citation': {'@date': '1985',
   '@first': '1337',
   '@last': '1345',
   '@name': 'J. Biochem.',
   '@type': 'journal article',
   '@volume': '97',
   'authorList': {'person': [{'@name': 'Kuramitsu S.'},
     {'@name': 'Inoue K.'},
     {'@name': 'Kondo K.'},
     {'@name': 'Aki K.'},
     {'@name': 'Kagamiyama H.'}]},
   'dbReference': [{'@id': '4030726', '@type': 'PubMed'},
    

In [175]:
content['comment']

[{'@type': 'function',
  'text': [{'$': 'Catalyzes the irreversible transamination of the L-tryptophan metabolite L-kynurenine to form kynurenic acid (KA). As a member of the malate-aspartate shuttle, it has a key role in the intracellular NAD(H) redox balance. Is important for metabolite exchange between mitochondria and cytosol, and for amino acid metabolism. Facilitates cellular uptake of long-chain free fatty acids.',
    '@evidence': [2]}]},
 {'@type': 'catalytic activity',
  'reaction': {'dbReference': [{'@id': 'RHEA:21824', '@type': 'Rhea'},
    {'@id': 'CHEBI:16452', '@type': 'ChEBI'},
    {'@id': 'CHEBI:16810', '@type': 'ChEBI'},
    {'@id': 'CHEBI:29985', '@type': 'ChEBI'},
    {'@id': 'CHEBI:29991', '@type': 'ChEBI'},
    {'@id': '2.6.1.1', '@type': 'EC'}],
   'text': '2-oxoglutarate + L-aspartate = L-glutamate + oxaloacetate'}},
 {'@type': 'catalytic activity',
  'reaction': {'dbReference': [{'@id': 'RHEA:20964', '@type': 'Rhea'},
    {'@id': 'CHEBI:16810', '@type': 'ChEBI'

In [176]:
content['sequence']

{'$': 'MALLHSARVLSGVASAFHPGLAAAASARASSWWAHVEMGPPDPILGVTEAYKRDTNSKKMNLGVGAYRDDNGKPYVLPSVRKAEAQIAAKGLDKEYLPIGGLAEFCRASAELALGENSEVVKSGRFVTVQTISGTGALRIGASFLQRFFKFSRDVFLPKPSWGNHTPIFRDAGMQLQSYRYYDPKTCGFDFTGALEDISKIPEQSVLLLHACAHNPTGVDPRPEQWKEIATVVKKRNLFAFFDMAYQGFASGDGDKDAWAVRHFIEQGINVCLCQSYAKNMGLYGERVGAFTVICKDADEAKRVESQLKILIRPMYSNPPIHGARIASTILTSPDLRKQWLQEVKGMADRIIGMRTQLVSNLKKEGSTHSWQHITDQIGMFCFTGLKPEQVERLTKEFSIYMTKDGRISVAGVTSGNVGYLAHAIHQVTK',
 '@checksum': '12F54284974D27A5',
 '@length': 430,
 '@mass': 47409,
 '@modified': '2013-09-18',
 '@precursor': True,
 '@version': 2}

### Collecting Protein name and synonims and abbreviations

In [177]:
import pandas as pd

df = pd.read_csv("input/8kProteins.csv")
df.head(1)

Unnamed: 0,Entry,Entry name,Protein names,Gene names
0,P16860,ANFB_HUMAN,Natriuretic peptides B (Gamma-brain natriureti...,NPPB


In [180]:
uids = list(df['Entry'])

In [182]:
#for uids

#### From downloaded file

In [None]:
#file =  open("P12345.xml", 'w')
#file.write(draw_data)
#entry_dict = schema.to_dict('./data/P12345.xml')

In [141]:
entry_dict.keys()

dict_keys(['@xmlns', '@xmlns:xsi', '@xsi:schemaLocation', 'entry', 'copyright'])

In [142]:
content = entry_dict['entry'][0]

In [143]:
list(content)[:6]

['@dataset', '@created', '@modified', '@version', 'accession', 'name']

In [144]:
content['accession']

['P12345', 'G1SKL2']

In [145]:
content['protein']

{'alternativeName': [{'fullName': 'Fatty acid-binding protein',
   'shortName': ['FABP-1']},
  {'fullName': 'Glutamate oxaloacetate transaminase 2'},
  {'fullName': 'Kynurenine aminotransferase 4'},
  {'fullName': 'Kynurenine aminotransferase IV'},
  {'fullName': 'Kynurenine--oxoglutarate transaminase 4'},
  {'fullName': 'Kynurenine--oxoglutarate transaminase IV'},
  {'fullName': 'Plasma membrane-associated fatty acid-binding protein',
   'shortName': ['FABPpm']},
  {'fullName': 'Transaminase A'}],
 'recommendedName': {'ecNumber': ['2.6.1.1', '2.6.1.7'],
  'fullName': 'Aspartate aminotransferase, mitochondrial',
  'shortName': ['mAspAT']}}

### References
1. https://gist.github.com/JoaoRodrigues/afe11985e4cab4c0002eebae2213e0a8
2. https://docs.python.org/3/howto/argparse.html
3. https://www.uniprot.org/help/api_retrieve_entries
4. https://blog.liang2.tw/posts/2018/01/read-uniprotkb-xml/