# Make Batch API calls to UniProt, download and parse all data

In [165]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
import json
import pickle

import data_utils as du
data_dir = du.find_data_dir('Programming')

BASE = 'http://www.uniprot.org'
BASE = 'https://www.ebi.ac.uk/proteins/api'
PROTEINS_ENDPOINT = '/proteins/'
KB_ENDPOINT = '/uniprot/'

In [46]:
def API_call_protein(uniprot_id):
    response = requests.get(BASE+PROTEINS_ENDPOINT+uniprot_id)
    return json.loads(response.text)

In [48]:
accession = 'Q96R72'
protein_json = API_call_protein(accession)
protein_json

{'accession': 'Q96R72',
 'id': 'OR4K3_HUMAN',
 'proteinExistence': 'Inferred from homology',
 'info': {'type': 'Swiss-Prot',
  'created': '2003-10-10',
  'modified': '2021-02-10',
  'version': 112},
 'organism': {'taxonomy': 9606,
  'names': [{'type': 'scientific', 'value': 'Homo sapiens'},
   {'type': 'common', 'value': 'Human'}],
  'lineage': ['Eukaryota',
   'Metazoa',
   'Chordata',
   'Craniata',
   'Vertebrata',
   'Euteleostomi',
   'Mammalia',
   'Eutheria',
   'Euarchontoglires',
   'Primates',
   'Haplorrhini',
   'Catarrhini',
   'Hominidae',
   'Homo']},
 'secondaryAccession': ['Q6IFA4'],
 'protein': {'recommendedName': {'fullName': {'value': 'Olfactory receptor 4K3'}},
  'alternativeName': [{'fullName': {'value': 'Olfactory receptor OR14-14'}}]},
 'gene': [{'name': {'value': 'OR4K3'}, 'synonyms': [{'value': 'OR4K3P'}]}],
 'comments': [{'type': 'FUNCTION',
   'text': [{'value': 'Odorant receptor',
     'evidences': [{'code': 'ECO:0000305'}]}]},
  {'type': 'SUBCELLULAR_LOCAT

In [135]:
payload = {'organism': 'Homo sapiens',
           'size': 100,
           'format': 'json',
           'offset':100}
response = requests.get(BASE+PROTEINS_ENDPOINT, params=payload)
proteins = json.loads(response.text)

In [136]:
def extract_protein_dict(protein):
    keys = ['accession', 'id', 'proteinExistence']
    protein_dict = {}

    try:
        protein_dict['full_name'] = protein['protein']['recommendedName']['fullName']['value']
    except:
        protein_dict['full_name'] = None
    
    for key in keys:
        protein_dict[key] = protein[key]

    protein_dict['organism_name'] = next(name['value'] for name in protein['organism']['names'] if name["type"] == "scientific")
    
    try:
        protein_dict['ecNumber'] = protein['protein']['recommendedName']['ecNumber'][0]['value']
    except:
        protein_dict['ecNumber'] = None

    if 'comments' in protein.keys():
        try:
            protein_dict['FUNCTION'] = next(comment['text'][0]['value'] for comment in protein['comments'] if comment["type"] == "FUNCTION")
        except:
            protein_dict['FUNCTION'] = None
        
        try:
            protein_dict['CATALYTIC_ACTIVITY'] = next(comment['reaction']['name'] for comment in protein['comments'] if comment["type"] == "CATALYTIC_ACTIVITY")
        except:
            protein_dict['CATALYTIC_ACTIVITY'] = None
        
        try:
            protein_dict['SIMILARITY'] = next(comment['text'][0]['value'] for comment in protein['comments'] if comment["type"] == "SIMILARITY")
        except:
            protein_dict['SIMILARITY'] = None
    else:
        protein_dict['FUNCTION'] = None
        protein_dict['CATALYTIC_ACTIVITY'] = None
        protein_dict['SIMILARITY'] = None

    other_dbs = ['EMBL', 'PeptideAtlas', 'GO', 'PANTHER', 'Pfam', 'PROSITE', 'InterPro', 'SMART']

    for db_name in other_dbs:
        try:
            protein_dict[db_name + '_id'] = next(dbReference['id'] for dbReference in protein['dbReferences'] if dbReference["type"] == db_name)
        except:
            protein_dict[db_name + '_id'] = None

    protein_dict['sequence_length'] = protein['sequence']['length']
    protein_dict['mass'] = protein['sequence']['mass']
    protein_dict['sequence'] = protein['sequence']['sequence']

    return protein_dict

def convert_json_to_df(proteins):
    proteins_dicts = [None]*len(proteins)
    for i, protein in enumerate(proteins):
        proteins_dicts[i] = extract_protein_dict(protein)

    return pd.DataFrame(proteins_dicts)

convert_json_to_df(proteins)

Unnamed: 0,full_name,accession,id,proteinExistence,organism_name,ecNumber,FUNCTION,CATALYTIC_ACTIVITY,SIMILARITY,EMBL_id,PeptideAtlas_id,GO_id,PANTHER_id,Pfam_id,PROSITE_id,InterPro_id,SMART_id,sequence_length,mass,sequence
0,NADH-ubiquinone oxidoreductase chain 4,A0A023Q6X3,A0A023Q6X3_HUMAN,Inferred from homology,Homo sapiens,7.1.1.2,Core subunit of the mitochondrial membrane res...,a ubiquinone + 5 H(+)(in) + NADH = a ubiquinol...,Belongs to the complex I subunit 4 family,KJ445864,,GO:0016021,PTHR43507,PF01059,,IPR000260,,459,51579,MLKLIVPTIMLLPLTWLSKKHMIWINTTTHSLIISIIPLLFFNQIN...
1,NADH-ubiquinone oxidoreductase chain 1,A0A023Q700,A0A023Q700_HUMAN,Inferred from homology,Homo sapiens,7.1.1.2,,a ubiquinone + 5 H(+)(in) + NADH = a ubiquinol...,Belongs to the complex I subunit 1 family,KJ445867,A0A023Q700,GO:0016021,PTHR11432,PF00146,PS00668,IPR001694,,318,35650,MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYGLL...
2,NADH-ubiquinone oxidoreductase chain 5,A0A023Q756,A0A023Q756_HUMAN,Inferred from homology,Homo sapiens,7.1.1.2,Core subunit of the mitochondrial membrane res...,a ubiquinone + 5 H(+)(in) + NADH = a ubiquinol...,Belongs to the complex I subunit 5 family,KJ445872,A0A023Q756,GO:0016021,,PF06455,,IPR010934,,603,67069,MTMHTTMTTLTLTSLIPPILTTLVNPNKKNSYPHYVKSIVASTFII...
3,ATP synthase subunit a,A0A023Q788,A0A023Q788_HUMAN,Inferred from homology,Homo sapiens,,Mitochondrial membrane ATP synthase (F(1)F(0) ...,,Belongs to the ATPase A chain family,KJ445850,A0A023Q788,GO:0016021,,PF00119,PS00449,IPR000568,,226,24767,MNENLFASFIAPTILGLPAAVLIILFPPLLIPTSKYLINNRLITTQ...
4,Cytochrome b,A0A023Q797,A0A023Q797_HUMAN,Inferred from homology,Homo sapiens,,Component of the ubiquinol-cytochrome c reduct...,,Belongs to the cytochrome b family,KF451781,A0A023Q797,GO:0016021,,PF00032,PS51003,IPR005798,,380,42740,MTPMRKINPLMKLINHSFIDLPTPSNISXWWNFGSLLGACLILQIT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,ATP synthase subunit a,A0A023QTT5,A0A023QTT5_HUMAN,Inferred from homology,Homo sapiens,,Mitochondrial membrane ATP synthase (F(1)F(0) ...,,Belongs to the ATPase A chain family,KF451007,A0A023QTT5,GO:0016021,,PF00119,PS00449,IPR000568,,226,24771,MNENLFASFIAPTILGLPATVLIILLPPLLIPTSKYLINNRLITTQ...
96,NADH-ubiquinone oxidoreductase chain 4,A0A023QTW4,A0A023QTW4_HUMAN,Inferred from homology,Homo sapiens,7.1.1.2,Core subunit of the mitochondrial membrane res...,a ubiquinone + 5 H(+)(in) + NADH = a ubiquinol...,Belongs to the complex I subunit 4 family,KF451842,,GO:0016021,PTHR43507,PF01059,,IPR000260,,459,51609,MLKLIVPTIMLLPLTWLSKKHMIWINTTTHSLIISIIPLLFFNQIN...
97,NADH-ubiquinone oxidoreductase chain 5,A0A023QTY8,A0A023QTY8_HUMAN,Inferred from homology,Homo sapiens,7.1.1.2,Core subunit of the mitochondrial membrane res...,a ubiquinone + 5 H(+)(in) + NADH = a ubiquinol...,Belongs to the complex I subunit 5 family,KF451601,A0A023QTY8,GO:0016021,,PF06455,,IPR010934,,603,66984,MTMHTTMTTLTLTSLIPPILTTLVNPNKKNSYPHYVKSIVASTFII...
98,NADH-ubiquinone oxidoreductase chain 5,A0A023QUA7,A0A023QUA7_HUMAN,Inferred from homology,Homo sapiens,7.1.1.2,Core subunit of the mitochondrial membrane res...,a ubiquinone + 5 H(+)(in) + NADH = a ubiquinol...,Belongs to the complex I subunit 5 family,KF451185,A0A023QUA7,GO:0016021,,PF06455,,IPR010934,,603,67081,MTMHTTMTTLTLTSLIPPILTTLVNPNKKNSYPHYVKSIVASTFII...


### BATCH download JSONs from API

In [162]:
protein_features = ['full_name', 'accession', 'id', 'proteinExistence', 'organism_name',
                     'ecNumber', 'FUNCTION', 'CATALYTIC_ACTIVITY', 'SIMILARITY', 'EMBL_id', 
                     'PeptideAtlas_id', 'GO_id', 'PANTHER_id', 'Pfam_id', 'PROSITE_id',
                     'InterPro_id', 'SMART_id', 'sequence_length', 'mass', 'sequence']

target_df = pd.DataFrame([], columns = protein_features)

start = 0
end = 1000
batchsize = 100
offsets = np.arange(start, end, batchsize)

for offset in offsets:
    payload = {'organism': 'Homo sapiens',
               'format': 'json',
               'size': batchsize,
               'offset': offset}
    
#     print(f'{offset}-{offset+batchsize}')
    
    response = requests.get(BASE+PROTEINS_ENDPOINT, params=payload)
    proteins = json.loads(response.text)
    
    target_df = pd.concat([target_df, convert_json_to_df(proteins)])

0-100
100-200
200-300
300-400
400-500
500-600
600-700
700-800
800-900
900-1000
1000-1100
1100-1200
1200-1300
1300-1400
1400-1500
1500-1600
1600-1700
1700-1800
1800-1900
1900-2000
2000-2100
2100-2200
2200-2300
2300-2400
2400-2500
2500-2600
2600-2700
2700-2800
2800-2900
2900-3000
3000-3100
3100-3200
3200-3300
3300-3400
3400-3500
3500-3600
3600-3700
3700-3800
3800-3900
3900-4000
4000-4100
4100-4200
4200-4300
4300-4400
4400-4500
4500-4600
4600-4700
4700-4800
4800-4900
4900-5000
5000-5100
5100-5200
5200-5300
5300-5400
5400-5500
5500-5600
5600-5700
5700-5800
5800-5900
5900-6000
6000-6100
6100-6200
6200-6300
6300-6400
6400-6500
6500-6600
6600-6700
6700-6800
6800-6900
6900-7000
7000-7100
7100-7200
7200-7300
7300-7400
7400-7500
7500-7600
7600-7700
7700-7800
7800-7900
7900-8000
8000-8100
8100-8200
8200-8300
8300-8400
8400-8500
8500-8600
8600-8700
8700-8800
8800-8900
8900-9000
9000-9100
9100-9200
9200-9300
9300-9400
9400-9500
9500-9600
9600-9700
9700-9800
9800-9900
9900-10000


In [169]:
target_df

Unnamed: 0,full_name,accession,id,proteinExistence,organism_name,ecNumber,FUNCTION,CATALYTIC_ACTIVITY,SIMILARITY,EMBL_id,PeptideAtlas_id,GO_id,PANTHER_id,Pfam_id,PROSITE_id,InterPro_id,SMART_id,sequence_length,mass,sequence
0,Methylcytosine dioxygenase TET,A0A023HHK9,A0A023HHK9_HUMAN,Evidence at transcript level,Homo sapiens,1.14.11.n2,Dioxygenase that catalyzes the conversion of t...,2-oxoglutarate + a 5-methyl-2'-deoxycytidine i...,Belongs to the TET family,JX311859,A0A023HHK9,GO:0005634,PTHR23358,PF12851,PS51058,IPR024779,SM01333,1305,142799,MSRSRHARPSRLVRKEDVNKKKKNSQLRKTTKGANKNVASVKTLSP...
1,Methylcytosine dioxygenase TET,A0A023HHL0,A0A023HHL0_HUMAN,Evidence at transcript level,Homo sapiens,1.14.11.n2,Dioxygenase that catalyzes the conversion of t...,2-oxoglutarate + a 5-methyl-2'-deoxycytidine i...,Belongs to the TET family,JX311858,A0A023HHL0,GO:0005634,PTHR23358,PF02008,PS51058,IPR040175,,694,75604,MSRSRHARPSRLVRKEDVNKKKKNSQLRKTTKGANKNVASVKTLSP...
2,,A0A023HJ61,A0A023HJ61_HUMAN,Evidence at transcript level,Homo sapiens,,,,,KC407675,A0A023HJ61,GO:0005525,,PF00071,,IPR027417,,121,13596,MSQTAMSETYDFLFKFLVIGNAGTGKSCLLHQFIEKKFKDDSNHTI...
3,,A0A023HN28,A0A023HN28_HUMAN,Evidence at transcript level,Homo sapiens,,,,,KC921221,,,,,,,,16,1865,MDMVENADSLQAQERK
4,Cytochrome b,A0A023I7F4,A0A023I7F4_HUMAN,Inferred from homology,Homo sapiens,,Component of the ubiquinol-cytochrome c reduct...,,Belongs to the cytochrome b family,KC990648,A0A023I7F4,GO:0016021,,PF00032,PS51003,IPR005798,,380,42790,MTPMRKINPLMKLINHSFIDLPTPSNISAWWNFGSLLGACLILQIT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,,A0A0A7C5T6,A0A0A7C5T6_HUMAN,Inferred from homology,Homo sapiens,,,,Belongs to the MHC class I family,KC875894,A0A0A7C5T6,,,PF00129,,IPR011161,,181,21077,SHSMRYFYTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPREEP...
96,,A0A0A7C5T7,A0A0A7C5T7_HUMAN,Predicted,Homo sapiens,,,,,KC632189,A0A0A7C5T7,GO:0042613,,PF00969,,IPR011162,SM00921,89,10626,DFVYQFKGMCYFTNGTERVRLVSRSIYNREEIVRFDSDVGEFRAVT...
97,,A0A0A7C5T8,A0A0A7C5T8_HUMAN,Predicted,Homo sapiens,,,,,KC875987,,GO:0042613,,PF00969,,IPR011162,SM00921,87,10506,NYLFQGRQECYAFNGTQRFLERYIYNREEFARFDSDVGEFRAVTEL...
98,,A0A0A7C5T9,A0A0A7C5T9_HUMAN,Predicted,Homo sapiens,,,,,KF998325,A0A0A7C5T9,GO:0042613,,PF00969,,IPR011162,SM00921,87,10562,NYLFQGRQECYAFNGTQRFLERYIYNREEFVRFDSDVGEFRAVTEL...


In [163]:
def dump_in_pickle(filename, data):
    with open(filename, 'wb') as f:
        pickle.dump(data, f)

def read_from_pickle(filename):
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    return data

In [None]:
dump_filename = du.get_file_path(data_dir, 'UniProt proteins', 'Parsed pickle', 'protein_features.p')
dump_in_pickle(dump_filename, target_df)

### Example of payload

In [18]:
payload = {'query': 'name:"polymerase alpha" AND taxonomy:mus AND reviewed:yes',
           'format': 'tab'}
response = requests.get(BASE+KB_ENDPOINT, params=payload)

In [25]:
f = StringIO(response.text)
pd.read_csv(f, delimiter='\t')

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length
0,Q61183,PAPOA_MOUSE,reviewed,Poly(A) polymerase alpha (PAP-alpha) (EC 2.7.7...,Papola Pap Plap,Mus musculus (Mouse),739
1,P25206,MCM3_MOUSE,reviewed,DNA replication licensing factor MCM3 (EC 3.6....,Mcm3 Mcmd Mcmd3,Mus musculus (Mouse),812
2,P33611,DPOA2_MOUSE,reviewed,DNA polymerase alpha subunit B (DNA polymerase...,Pola2,Mus musculus (Mouse),600
3,P33609,DPOLA_MOUSE,reviewed,DNA polymerase alpha catalytic subunit (EC 2.7...,Pola1 Pola,Mus musculus (Mouse),1465


### Read uniprot human proteints export

In [70]:
%%time
pd.read_csv('uniprot-filtered-organism__Homo+sapiens+(Human)+[9606]_.tab', delimiter='\t')

Wall time: 329 ms


Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length
0,Q96R72,OR4K3_HUMAN,reviewed,Olfactory receptor 4K3 (Olfactory receptor OR1...,OR4K3 OR4K3P,Homo sapiens (Human),315
1,Q9UKL2,O52A1_HUMAN,reviewed,Olfactory receptor 52A1 (HPFH1OR) (Odorant rec...,OR52A1,Homo sapiens (Human),312
2,Q9H205,O2AG1_HUMAN,reviewed,Olfactory receptor 2AG1 (HT3) (Olfactory recep...,OR2AG1 OR2AG3,Homo sapiens (Human),316
3,Q8NGN2,O10S1_HUMAN,reviewed,Olfactory receptor 10S1 (Olfactory receptor OR...,OR10S1,Homo sapiens (Human),331
4,Q8NGC1,O11G2_HUMAN,reviewed,Olfactory receptor 11G2 (Olfactory receptor OR...,OR11G2,Homo sapiens (Human),345
...,...,...,...,...,...,...,...
194517,A0A1X9MM07,A0A1X9MM07_HUMAN,unreviewed,ATP synthase subunit a,ATP6,Homo sapiens (Human),226
194518,B2R615,B2R615_HUMAN,unreviewed,K(+) channel subunit beta-2 (Kv-beta-2) (Volta...,,Homo sapiens (Human),367
194519,A0A0A7C4K8,A0A0A7C4K8_HUMAN,unreviewed,MHC class I antigen (Fragment),HLA-A,Homo sapiens (Human),181
194520,H9A532,H9A532_HUMAN,unreviewed,BCL6 corepressor-cyclin B3 fusion protein,,Homo sapiens (Human),3038
