## Using the UniProt REST API to extract protein data

In [1]:
#!pip install xmlschema

### [Programmatic access - Retrieving entries via queries](https://www.uniprot.org/help/api_queries)


In [28]:
"""
Queries Uniprot database and retrieves protein data.
"""
import urllib.request
from urllib.error import HTTPError
from lxml import etree
import lxml.etree
import lxml.html
import json
import xmlschema
import lxml
import requests
from bs4 import BeautifulSoup
import html

#### Query the details of a single protein

In [1]:
# Accession of the entry to inspect, and the extension that requests XML.
protein = 'Q6UX04'
format = '.xml'
# Full address of the entry: base address + accession + '.xml'.
url = ''.join(['https://www.uniprot.org/uniprot/', protein, format])

# Download the XML document published for this accession.
with urllib.request.urlopen(url) as r:
    raw_data = r.read().strip()

# Parse the downloaded bytes into an lxml element.
tree = etree.fromstring(raw_data)

# Load the UniProt XSD; it drives the XML -> dict decoding below.
schema = xmlschema.XMLSchema('https://www.uniprot.org/docs/uniprot.xsd')

# `tree` is the parsed XML document handed to the schema, which decodes it
# into nested dicts and lists.
entry_dict = schema.to_dict(tree)

# Keep the first entry of the decoded document ...
content = entry_dict['entry'][0]
# ... and from it the 'protein' section, which holds the recommended and
# alternative names explored in the following cells.
names = content['protein']

NameError: name 'urllib' is not defined

In [4]:
# Show the decoded 'protein' section currently held in `names`.
names

{'alternativeName': [{'fullName': {'$': 'Antigen NY-CO-10',
    '@evidence': [16]}},
  {'fullName': {'$': 'Probable inactive peptidyl-prolyl cis-trans isomerase CWC27 homolog',
    '@evidence': [18]},
   'shortName': [{'$': 'PPIase CWC27', '@evidence': [18]}]},
  {'fullName': 'Serologically defined colon cancer antigen 10'}],
 'recommendedName': {'fullName': {'$': 'Spliceosome-associated protein CWC27 homolog',
   '@evidence': [17]}}}

In [5]:
# Show the list of alternative-name records.
names['alternativeName']

[{'fullName': {'$': 'Antigen NY-CO-10', '@evidence': [16]}},
 {'fullName': {'$': 'Probable inactive peptidyl-prolyl cis-trans isomerase CWC27 homolog',
   '@evidence': [18]},
  'shortName': [{'$': 'PPIase CWC27', '@evidence': [18]}]},
 {'fullName': 'Serologically defined colon cancer antigen 10'}]

In [6]:
# Print each alternative name's 'fullName' value exactly as decoded: either a
# dict (text under '$' plus '@evidence') or a bare string.
for alt_name in names['alternativeName']:
    print(alt_name['fullName'])

{'@evidence': [16], '$': 'Antigen NY-CO-10'}
{'@evidence': [18], '$': 'Probable inactive peptidyl-prolyl cis-trans isomerase CWC27 homolog'}
Serologically defined colon cancer antigen 10


In [7]:
# Print only the text of each alternative full name.
# A decoded full name is either a dict holding its text under '$' or a bare
# string. An explicit type check (instead of catching every exception) keeps
# genuine errors, such as a record without 'fullName', visible.
for item in names['alternativeName']:
    full_name = item['fullName']
    if isinstance(full_name, dict):
        # Fall back to the whole dict if it unexpectedly has no '$' key.
        print(full_name.get('$', full_name))
    else:
        print(full_name)

Antigen NY-CO-10
Probable inactive peptidyl-prolyl cis-trans isomerase CWC27 homolog
Serologically defined colon cancer antigen 10


In [8]:
# Show the recommended-name record.
names['recommendedName']

{'fullName': {'$': 'Spliceosome-associated protein CWC27 homolog',
  '@evidence': [17]}}

In [9]:
# Text of the recommended full name (stored under the '$' key).
names['recommendedName']['fullName']['$']

'Spliceosome-associated protein CWC27 homolog'

In [10]:
# The recommended full-name record: its text under '$' plus '@evidence'.
names['recommendedName']['fullName']

{'$': 'Spliceosome-associated protein CWC27 homolog', '@evidence': [17]}

In [11]:
# Text of the recommended full name (stored under the '$' key).
names['recommendedName']['fullName']['$']

'Spliceosome-associated protein CWC27 homolog'

#### Data Extraction

In [12]:
import pandas as pd

# Load the protein table from raw-protein.csv and preview its first 10 rows.
df = pd.read_csv("raw-protein.csv")
df.head(10)

Unnamed: 0,Uniprot ID,Gene Name,Uniprot Entry,Uniprot Protein Name
0,O00203,AP3B1,AP3B1_HUMAN,AP-3 complex subunit beta-1 (Adaptor protein c...
1,O60885,BRD4,BRD4_HUMAN,Bromodomain-containing protein 4 (Protein HUNK1)
2,P25440,BRD2,BRD2_HUMAN,Bromodomain-containing protein 2 (O27.1.1) (Re...
3,Q6UX04,CWC27,CWC27_HUMAN,Spliceosome-associated protein CWC27 homolog (...
4,Q86VM9,ZC3H18,ZCH18_HUMAN,Zinc finger CCCH domain-containing protein 18 ...
5,Q8IWA5,SLC44A2,CTL2_HUMAN,Choline transporter-like protein 2 (Solute car...
6,O75439,PMPCB,MPPB_HUMAN,Mitochondrial-processing peptidase subunit bet...
7,O95070,YIF1A,YIF1A_HUMAN,Protein YIF1A (54TMp) (YIP1-interacting factor...
8,P05026,ATP1B1,AT1B1_HUMAN,Sodium/potassium-transporting ATPase subunit b...
9,P11310,ACADM,ACADM_HUMAN,"Medium-chain specific acyl-CoA dehydrogenase, ..."


In [13]:
# Collect the values of the 'Uniprot ID' column; these drive the download loop.
uids = list(df['Uniprot ID'])

In [18]:
# Number of IDs that will be queried.
len(uids)

333

In [19]:
format = '.xml'
# Base address; the accession and the '.xml' suffix are appended per protein.
url = 'https://www.uniprot.org/uniprot/'
# UniProt XSD, loaded once and reused to decode every downloaded entry.
schema = xmlschema.XMLSchema('https://www.uniprot.org/docs/uniprot.xsd')


def _as_list(value):
    """Return ``value`` unchanged if it is a list, otherwise wrap it in one."""
    return value if isinstance(value, list) else [value]


def _name_text(value):
    """Return the plain text of one decoded name element.

    Decoded elements that carry attributes are dicts holding their text under
    the '$' key; elements without attributes are bare strings. Returns None
    for a dict that has no '$' key.
    """
    if isinstance(value, dict):
        return value.get('$')
    return value


def _name_texts(value):
    """Return the texts of a single name element or a list of them.

    Elements without usable text are skipped, so the result only ever
    contains what ``_name_text`` could extract.
    """
    texts = (_name_text(element) for element in _as_list(value))
    return [text for text in texts if text is not None]


def _fetch_protein_section(search_url):
    """Download one entry's XML and return its decoded 'protein' section.

    Any download, parsing, decoding or lookup error propagates to the caller.
    """
    with urllib.request.urlopen(search_url) as r:
        raw_data = r.read().strip()
    tree = etree.fromstring(raw_data)
    # Decode the parsed XML into nested dicts/lists and keep the first entry.
    entry_dict = schema.to_dict(tree)
    return entry_dict['entry'][0]['protein']


def _collect_names(uid, protein, log):
    """Gather every name string of one protein, in a fixed order.

    Order: recommended full name, recommended short names, EC numbers
    (prefixed with "EC "), then for every alternative name its full name
    followed by its short names.

    Args:
        uid: ID used as the key of the entries appended to ``log``.
        protein: decoded 'protein' section (a dict).
        log: list that receives one entry per missing part.

    Returns:
        A new list of name strings for this protein only.
    """
    collected = []
    recommended = protein.get('recommendedName') or {}

    # Recommended full name ------------------------------------------------
    if 'fullName' in recommended:
        text = _name_text(recommended['fullName'])
        if text is None:
            print("$ rname not found!")
        else:
            collected.append(text)
    else:
        log.append({uid: "no  reco full name"})

    # Recommended short names: every short name is kept and reduced to text,
    # whether it was decoded as a dict or as a bare string.
    if 'shortName' in recommended:
        collected.extend(_name_texts(recommended['shortName']))
    else:
        log.append({uid: "no Short Name"})

    # EC numbers -------------------------------------------------------------
    if 'ecNumber' in recommended:
        collected.extend("EC " + text for text in _name_texts(recommended['ecNumber']))
    else:
        log.append({uid: "no EC num"})

    # Alternative names --------------------------------------------------------
    # The list is rebuilt for every protein, so an entry without alternative
    # names contributes none instead of reusing another protein's list.
    if 'alternativeName' in protein:
        alternatives = _as_list(protein['alternativeName'])
    else:
        alternatives = []
        # Logged as a bare ID (not a dict), matching the established LOG format.
        log.append(uid)

    for alt in alternatives:
        if 'fullName' in alt:
            text = _name_text(alt['fullName'])
            if text is None:
                print("alt fullname not found")
            else:
                collected.append(text)
        else:
            log.append({uid: "no alt full name"})

        if 'shortName' in alt:
            collected.extend(_name_texts(alt['shortName']))
        else:
            log.append({uid: "no alt short name"})

    return collected


# DATA: one {"ID": ..., "names": [...]} record per ID, in the order of `uids`.
# LOG: notes about name parts that were missing for an ID.
DATA = []
LOG = []
for ID in uids:
    ID = str(ID)
    data = {"ID": ID, "names": []}

    search_url = url + ID + format

    try:
        names = _fetch_protein_section(search_url)
    except Exception as exc:
        # Best effort: report the failure (with its cause) and keep the record
        # with an empty name list. Catching `Exception` rather than everything
        # means Ctrl-C can still interrupt this long-running loop.
        names = None
        print(ID, "not found!", repr(exc))

    if names:
        data['names'] = _collect_names(ID, names, LOG)
    DATA.append(data)

In [20]:
# Serialise the collected records to covid-proteins.json.
with open("covid-proteins.json", 'w') as out_file:
    out_file.write(json.dumps(DATA))

In [21]:
# Preview the first ten collected records.
DATA[0:10]

[{'ID': 'O00203',
  'names': ['AP-3 complex subunit beta-1',
   'Adaptor protein complex AP-3 subunit beta-1',
   'Adaptor-related protein complex 3 subunit beta-1',
   'Beta-3A-adaptin',
   'Clathrin assembly protein complex 3 beta-1 large chain']},
 {'ID': 'O60885',
  'names': ['Bromodomain-containing protein 4', 'Protein HUNK1']},
 {'ID': 'P25440',
  'names': ['Bromodomain-containing protein 2',
   'O27.1.1',
   'Really interesting new gene 3 protein']},
 {'ID': 'Q6UX04',
  'names': ['Spliceosome-associated protein CWC27 homolog',
   'Antigen NY-CO-10',
   'Probable inactive peptidyl-prolyl cis-trans isomerase CWC27 homolog',
   'PPIase CWC27',
   'Serologically defined colon cancer antigen 10']},
 {'ID': 'Q86VM9',
  'names': ['Zinc finger CCCH domain-containing protein 18',
   'Nuclear protein NHN1']},
 {'ID': 'Q8IWA5',
  'names': ['Choline transporter-like protein 2',
   'Solute carrier family 44 member 2']},
 {'ID': 'O75439',
  'names': ['Mitochondrial-processing peptidase subuni

In [22]:
# Number of collected records.
len(DATA)

333

In [26]:
# One line per record: the ID followed by its names, separated by '|'.
# Names are lower-cased and their spaces replaced with underscores.
with open("covid-protein.txt", 'w') as out_file:
    for record in DATA:
        fields = [record['ID']]
        fields += [name.lower().replace(" ", "_") for name in record['names']]
        out_file.write("|".join(fields) + "\n")

In [27]:
# One line per record: its names only (no ID), separated by '|'.
# Names are lower-cased and their spaces replaced with underscores; a record
# without names produces an empty line.
with open("covs.txt", 'w') as out_file:
    for record in DATA:
        normalised = [name.lower().replace(" ", "_") for name in record['names']]
        out_file.write("|".join(normalised) + "\n")

#### Final Data

In [29]:
# Read the saved records back from covid-proteins.json.
with open("covid-proteins.json", 'r') as in_file:
    DATA = json.loads(in_file.read())

In [30]:
# Build a table from the records (one row per record).
DF = pd.DataFrame(DATA)

In [34]:
# 'rname' is the first collected name of each record. A record whose 'names'
# list is empty gets None instead of raising IndexError.
DF['rname'] = DF['names'].apply(lambda x: x[0] if x else None)

In [35]:
# Preview the first rows of the table, including the new 'rname' column.
DF.head()

Unnamed: 0,ID,names,rname
0,O00203,"[AP-3 complex subunit beta-1, Adaptor protein ...",AP-3 complex subunit beta-1
1,O60885,"[Bromodomain-containing protein 4, Protein HUNK1]",Bromodomain-containing protein 4
2,P25440,"[Bromodomain-containing protein 2, O27.1.1, Re...",Bromodomain-containing protein 2
3,Q6UX04,"[Spliceosome-associated protein CWC27 homolog,...",Spliceosome-associated protein CWC27 homolog
4,Q86VM9,[Zinc finger CCCH domain-containing protein 18...,Zinc finger CCCH domain-containing protein 18


In [36]:
# Save the table to covid-proteins.csv.
DF.to_csv("covid-proteins.csv")

In [38]:
# Map every ID to its 'rname'; if an ID occurs more than once, the last row wins.
id2name = dict(zip(DF['ID'], DF['rname']))
    

In [40]:
#id2name

In [41]:
# Save the ID -> name mapping to id2name.json.
with open("id2name.json", 'w') as out_file:
    out_file.write(json.dumps(id2name))