In [2]:
import requests
import pandas as pd

In [19]:
url = 'https://sparql.uniprot.org/sparql/'
query = '''
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX keywords: <http://purl.uniprot.org/keywords/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
SELECT DISTINCT
    (CONCAT(SUBSTR(STR(?protein), 33)) AS ?entry_id)
    ?taxonomic_rank
    ?taxon
    ?keyword
    ?keyword_category
    ?sequence
    ?reviewed
WHERE
{
    ?protein a up:Protein .
    ?protein rdfs:seeAlso <http://purl.uniprot.org/pfam/PF00123> .
    ?protein up:sequence ?sequence_class .
    FILTER NOT EXISTS { ?sequence_class up:basedOn ?primary_sequence }
    FILTER NOT EXISTS { ?sequence_class up:precursor true }
    FILTER NOT EXISTS { ?sequence_class up:fragment 'single' }
    FILTER NOT EXISTS { ?sequence_class up:fragment 'multiple' }
    ?sequence_class rdf:value ?sequence .
    OPTIONAL {
        ?protein up:organism ?organism .
        ?organism rdfs:subClassOf ?ancestor .
        ?ancestor up:rank ?taxonomic_rank_class .
        BIND(SUBSTR(STR(?taxonomic_rank_class), 30) AS ?taxonomic_rank) .
        ?ancestor up:scientificName ?taxon .
    }
    OPTIONAL {
        ?protein up:classifiedWith ?keyword_class .
        {
            ?keyword_class rdfs:subClassOf keywords:9999 .
            BIND('biological_process' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9998 .
            BIND('cellular_component' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9997 .
            BIND('coding_sequence_diversity' AS ?keyword_category)
        } UNION {
            ?keywords_class rdfs:subClassOf keywords:9996 .
            BIND('developmental_stage' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9995 .
            BIND('disease' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9994 .
            BIND('domain' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9993 .
            BIND('ligand' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9992 .
            BIND('molecular_function' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9991 .
            BIND('post_translational_modification' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9990 .
            BIND('technical_term' AS ?keyword_category)
        }
        ?keyword_class skos:prefLabel ?keyword .
    }
    OPTIONAL { ?protein up:reviewed ?reviewed_true . }
    BIND(IF(BOUND(?reviewed_true), 'true', 'false') AS ?reviewed)
}
'''
r = requests.get(url, params={'format': 'tab', 'query': query}, headers={'From': 'nilasschusler@gmail.com'}, stream=True)

In [20]:
outfile = open('data/data.csv','wb')
for chunk in r.iter_content(chunk_size=128):
    outfile.write(chunk)
outfile.close()

In [39]:
data = pd.read_csv('data/data.csv', sep='\t')
data.columns = [column.strip() for column in data.columns]
data

Unnamed: 0,entry_id,taxonomic_rank,taxon,keyword,keyword_category,sequence,reviewed
0,A0A151M097,Order,Crocodylia,Cell adhesion,biological_process,MDKVSTLYIEYPKAMKMKSVYFVAGLLLMIVQGSWQNPLQDTEEKS...,True
1,A0A151M097,Family,Alligatoridae,Cell adhesion,biological_process,MDKVSTLYIEYPKAMKMKSVYFVAGLLLMIVQGSWQNPLQDTEEKS...,True
2,A0A151M097,Superkingdom,Eukaryota,Cell adhesion,biological_process,MDKVSTLYIEYPKAMKMKSVYFVAGLLLMIVQGSWQNPLQDTEEKS...,True
3,A0A151M097,Kingdom,Metazoa,Cell adhesion,biological_process,MDKVSTLYIEYPKAMKMKSVYFVAGLLLMIVQGSWQNPLQDTEEKS...,True
4,A0A151M097,Phylum,Chordata,Cell adhesion,biological_process,MDKVSTLYIEYPKAMKMKSVYFVAGLLLMIVQGSWQNPLQDTEEKS...,True
...,...,...,...,...,...,...,...
44174,A0A3B4XXR0,Subclass,Neopterygii,,,MASSSKATLILLIYGILMHYSVFCTPIGLSYPKIRLENDAFDEDGN...,True
44175,A0A3B4XXR0,Phylum,Chordata,,,MASSSKATLILLIYGILMHYSVFCTPIGLSYPKIRLENDAFDEDGN...,True
44176,A0A3B4XXR0,Superclass,Actinopterygii,,,MASSSKATLILLIYGILMHYSVFCTPIGLSYPKIRLENDAFDEDGN...,True
44177,A0A3B4XXR0,Family,Carangidae,,,MASSSKATLILLIYGILMHYSVFCTPIGLSYPKIRLENDAFDEDGN...,True


In [40]:
data_wide = data.pivot(index=['entry_id', 'sequence', 'reviewed', 'keyword', 'keyword_category'], columns='taxonomic_rank', values='taxon')
data_wide = data_wide.reset_index(level=['entry_id', 'sequence', 'reviewed', 'keyword', 'keyword_category'])
data_wide.columns.names = ['']

In [41]:
with pd.option_context('display.max_columns', None):
    display(data_wide)

Unnamed: 0,entry_id,sequence,reviewed,keyword,keyword_category,Class,Cohort,Family,Genus,Infraclass,Infraorder,Kingdom,Order,Parvorder,Phylum,Species,Species_Group,Subclass,Subcohort,Subfamily,Subgenus,Subkingdom,Suborder,Subphylum,Superclass,Superfamily,Superkingdom,Superorder,Tribe
0,A0A060VPW7,MSSKATLALLIYGIIMHYSIHCSPLGLSYPNLRLENEVYDEDGNSL...,True,Reference proteome,technical_term,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
1,A0A060VPW7,MSSKATLALLIYGIIMHYSIHCSPLGLSYPNLRLENEVYDEDGNSL...,True,Signal,domain,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
2,A0A060VXS0,MKGTHSFAGLLLLIIAQSSLQIPQEDTEDNSSLLTEDSMFSEPREL...,True,Reference proteome,technical_term,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
3,A0A060VXS0,MKGTHSFAGLLLLIIAQSSLQIPQEDTEDNSSLLTEDSMFSEPREL...,True,Signal,domain,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
4,A0A060VY52,MKSTHSFAGLLLLIIVQSSWQIPQESTEDNSSLLTEDSMFSEPREL...,True,Reference proteome,technical_term,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3468,W5PIG8,MLTRKSVLKPRSDRFLLTAQINRSRGLVTSVITTGSSRRNRSATLS...,True,Amidation,post_translational_modification,Mammalia,,Bovidae,Ovis,,Pecora,Metazoa,Artiodactyla,,Chordata,,,,,Caprinae,,,Ruminantia,Craniata,Sarcopterygii,,Eukaryota,Laurasiatheria,
3469,W5PIG8,MLTRKSVLKPRSDRFLLTAQINRSRGLVTSVITTGSSRRNRSATLS...,True,Reference proteome,technical_term,Mammalia,,Bovidae,Ovis,,Pecora,Metazoa,Artiodactyla,,Chordata,,,,,Caprinae,,,Ruminantia,Craniata,Sarcopterygii,,Eukaryota,Laurasiatheria,
3470,W5Q8U4,MLLWVFFLVTLTLSSGSQGSLPSQPLRIPRYADAIFTNSYRKILGQ...,True,Amidation,post_translational_modification,Mammalia,,Bovidae,Ovis,,Pecora,Metazoa,Artiodactyla,,Chordata,,,,,Caprinae,,,Ruminantia,Craniata,Sarcopterygii,,Eukaryota,Laurasiatheria,
3471,W5Q8U4,MLLWVFFLVTLTLSSGSQGSLPSQPLRIPRYADAIFTNSYRKILGQ...,True,Reference proteome,technical_term,Mammalia,,Bovidae,Ovis,,Pecora,Metazoa,Artiodactyla,,Chordata,,,,,Caprinae,,,Ruminantia,Craniata,Sarcopterygii,,Eukaryota,Laurasiatheria,


In [42]:
data_wide['entry_id'].nunique()

1847

In [43]:
data_wide['keyword'].unique()

array(['Reference proteome', 'Signal', 'Secreted',
       'Cleavage on pair of basic residues', 'Hormone', 'Coiled coil',
       'Transmembrane helix', 'Neuropeptide', 'Proteomics identification',
       'Cell adhesion', 'Hydrolase', nan, 'Ion transport', 'ATP-binding',
       'Cytoplasm', 'Immunity', 'Phosphoprotein', 'Cell membrane',
       'Cell projection', 'Disulfide bond', 'Lyase', 'Amidation',
       'Transport', 'Transferase', 'Direct protein sequencing',
       'Antiviral defense', 'Helicase', 'Innate immunity',
       'Metal-binding', 'Zinc',
       'G-protein coupled receptor impairing toxin', 'Glycoprotein',
       'Hypotensive agent', '3D-structure'], dtype=object)