In [6]:
import requests
import pandas as pd

In [55]:
url = 'https://sparql.uniprot.org/sparql/'
query = '''
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX keywords: <http://purl.uniprot.org/keywords/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX faldo: <http://biohackathon.org/resource/faldo#>
SELECT DISTINCT
    (CONCAT(SUBSTR(STR(?protein), 33)) AS ?entry_id)
    ?taxonomic_rank
    ?taxon
    ?keyword
    ?keyword_category
    ?sequence
    ?reviewed
    ?subsequence
WHERE
{
    ?protein a up:Protein .
    ?protein up:annotation ?annotaion .
    ?annotaion rdf:type up:Domain_Extent_Annotation .
    ?annotaion rdfs:comment 'GLUCAGON' .
    ?protein up:sequence ?sequence_class .
    FILTER NOT EXISTS { ?sequence_class up:basedOn ?primary_sequence }
    #FILTER NOT EXISTS { ?sequence_class up:precursor true }
    FILTER NOT EXISTS { ?sequence_class up:fragment 'single' }
    FILTER NOT EXISTS { ?sequence_class up:fragment 'multiple' }
    ?sequence_class rdf:value ?sequence .
    ?annotaion up:range ?range .
    ?range faldo:end ?end_class .
    ?range faldo:begin ?begin_class .
    ?end_class faldo:position ?end .
    ?begin_class faldo:position ?begin .
    BIND(SUBSTR(?sequence, ?begin, ?end-?begin+1) AS ?subsequence)
    OPTIONAL {
        ?protein up:organism ?organism .
        ?organism rdfs:subClassOf ?ancestor .
        ?ancestor up:rank ?taxonomic_rank_class .
        BIND(SUBSTR(STR(?taxonomic_rank_class), 30) AS ?taxonomic_rank) .
        ?ancestor up:scientificName ?taxon .
    }
    OPTIONAL {
        ?protein up:classifiedWith ?keyword_class .
        {
            ?keyword_class rdfs:subClassOf keywords:9999 .
            BIND('biological_process' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9998 .
            BIND('cellular_component' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9997 .
            BIND('coding_sequence_diversity' AS ?keyword_category)
        } UNION {
            ?keywords_class rdfs:subClassOf keywords:9996 .
            BIND('developmental_stage' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9995 .
            BIND('disease' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9994 .
            BIND('domain' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9993 .
            BIND('ligand' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9992 .
            BIND('molecular_function' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9991 .
            BIND('post_translational_modification' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9990 .
            BIND('technical_term' AS ?keyword_category)
        }
        ?keyword_class skos:prefLabel ?keyword .
    }
    OPTIONAL { ?protein up:reviewed ?reviewed_true . }
    BIND(IF(BOUND(?reviewed_true), 'true', 'false') AS ?reviewed)
}
'''
r = requests.get(url, params={'format': 'tab', 'query': query}, headers={'From': 'nilasschusler@gmail.com'}, stream=True)

In [56]:
outfile = open('data/data.csv','wb')
for chunk in r.iter_content(chunk_size=128):
    outfile.write(chunk)
outfile.close()

In [57]:
data = pd.read_csv('data/data.csv', sep='\t')
data.columns = [column.strip() for column in data.columns]
data

Unnamed: 0,entry_id,taxonomic_rank,taxon,keyword,keyword_category,sequence,reviewed,subsequence
0,G3HZV5,Subfamily,Cricetinae,Cell adhesion,biological_process,MKAGAKNPVVRVFIVDTTYPHHVGPIEVPVPEMIASRFYSSNEFEG...,True,HSQGTFTSDYSKYLDSRRAQDFV
1,G3HZV5,Subfamily,Cricetinae,Cell adhesion,biological_process,MKAGAKNPVVRVFIVDTTYPHHVGPIEVPVPEMIASRFYSSNEFEG...,True,HADGSFSDEMNTILDSLATRDFI
2,G3HZV5,Genus,Cricetulus,Cell adhesion,biological_process,MKAGAKNPVVRVFIVDTTYPHHVGPIEVPVPEMIASRFYSSNEFEG...,True,HSQGTFTSDYSKYLDSRRAQDFV
3,G3HZV5,Genus,Cricetulus,Cell adhesion,biological_process,MKAGAKNPVVRVFIVDTTYPHHVGPIEVPVPEMIASRFYSSNEFEG...,True,HADGSFSDEMNTILDSLATRDFI
4,A0A556VW44,Subfamily,Sisorinae,Cell adhesion,biological_process,MFSSSHTMWWSPSGKFLAFAQFNDSDVHVIEYSWYGQGQYPETIAI...,True,HSEGTFSNDYSKYLETRRAQDFV
...,...,...,...,...,...,...,...,...
54133,A9QLJ1,Suborder,Anabantoidei,,,MASSSKATLILLIYGILMHYSVFCTPIGLSYPKIRLENDAFDEDGN...,True,HSDGIFTDSYSRYRKQMAVKKYL
54134,A0A2D0QK13,Genus,Ictalurus,,,MFKAMLHRSASQLLFLVAICGVFYTRTLSLPLATTRATRHADGLFT...,True,HSDAIFTDNYSRFRKQMAVKKYL
54135,A0A452T8T2,Family,Ursidae,,,SLLTLKPFSGIETSGARCIALIPRYADAIFTNSYRKVLGQLSARKL...,True,YADAIFTNSYRKVLGQLSARKLL
54136,Q3UYH8,Genus,Mus,,,MTMCSGARLALLVYGIIMHSSVSCSPAAGLSFPGIRPEDEAYDQDG...,True,HSDGIFTDSYSRYRKQMAVKKYL


In [58]:
data_wide = data.pivot(index=['entry_id', 'sequence', 'reviewed', 'keyword', 'keyword_category', 'subsequence'], columns='taxonomic_rank', values='taxon')
data_wide = data_wide.reset_index(level=['entry_id', 'sequence', 'reviewed', 'keyword', 'keyword_category', 'subsequence'])
data_wide.columns.names = ['']

In [59]:
with pd.option_context('display.max_columns', None):
    display(data_wide)

Unnamed: 0,entry_id,sequence,reviewed,keyword,keyword_category,subsequence,Class,Cohort,Family,Genus,Infraclass,Infraorder,Kingdom,Order,Parvorder,Phylum,Species,Species_Group,Subclass,Subcohort,Subfamily,Subgenus,Subkingdom,Suborder,Subphylum,Superclass,Superfamily,Superkingdom,Superorder,Tribe
0,A0A060VPW7,MSSKATLALLIYGIIMHYSIHCSPLGLSYPNLRLENEVYDEDGNSL...,True,Reference proteome,technical_term,HSDGIFTDSYSRYRKQMAVKKYL,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
1,A0A060VPW7,MSSKATLALLIYGIIMHYSIHCSPLGLSYPNLRLENEVYDEDGNSL...,True,Signal,domain,HSDGIFTDSYSRYRKQMAVKKYL,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
2,A0A060VXS0,MKGTHSFAGLLLLIIAQSSLQIPQEDTEDNSSLLTEDSMFSEPREL...,True,Reference proteome,technical_term,HAEGTYTSDMSSYLQDQAAKEFV,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
3,A0A060VXS0,MKGTHSFAGLLLLIIAQSSLQIPQEDTEDNSSLLTEDSMFSEPREL...,True,Reference proteome,technical_term,HSEGTFSNDYSKYLQSRRVQDFV,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
4,A0A060VXS0,MKGTHSFAGLLLLIIAQSSLQIPQEDTEDNSSLLTEDSMFSEPREL...,True,Signal,domain,HAEGTYTSDMSSYLQDQAAKEFV,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4275,W5PIG8,MLTRKSVLKPRSDRFLLTAQINRSRGLVTSVITTGSSRRNRSATLS...,True,Amidation,post_translational_modification,HSDGIFTDSYSRYRKQMAVKKYL,Mammalia,,Bovidae,Ovis,,Pecora,Metazoa,Artiodactyla,,Chordata,,,,,Caprinae,,,Ruminantia,Craniata,Sarcopterygii,,Eukaryota,Laurasiatheria,
4276,W5PIG8,MLTRKSVLKPRSDRFLLTAQINRSRGLVTSVITTGSSRRNRSATLS...,True,Reference proteome,technical_term,HSDGIFTDSYSRYRKQMAVKKYL,Mammalia,,Bovidae,Ovis,,Pecora,Metazoa,Artiodactyla,,Chordata,,,,,Caprinae,,,Ruminantia,Craniata,Sarcopterygii,,Eukaryota,Laurasiatheria,
4277,W5Q8U4,MLLWVFFLVTLTLSSGSQGSLPSQPLRIPRYADAIFTNSYRKILGQ...,True,Amidation,post_translational_modification,YADAIFTNSYRKILGQLSARKLL,Mammalia,,Bovidae,Ovis,,Pecora,Metazoa,Artiodactyla,,Chordata,,,,,Caprinae,,,Ruminantia,Craniata,Sarcopterygii,,Eukaryota,Laurasiatheria,
4278,W5Q8U4,MLLWVFFLVTLTLSSGSQGSLPSQPLRIPRYADAIFTNSYRKILGQ...,True,Reference proteome,technical_term,YADAIFTNSYRKILGQLSARKLL,Mammalia,,Bovidae,Ovis,,Pecora,Metazoa,Artiodactyla,,Chordata,,,,,Caprinae,,,Ruminantia,Craniata,Sarcopterygii,,Eukaryota,Laurasiatheria,


In [60]:
data_wide['entry_id'].nunique()

1824

In [61]:
data_wide['keyword'].unique()

array(['Reference proteome', 'Signal', 'Secreted',
       'Cleavage on pair of basic residues', 'Hormone', 'Coiled coil',
       'Transmembrane helix', 'Neuropeptide', 'Proteomics identification',
       'Cell adhesion', 'Hydrolase', nan, 'DNA-binding', 'Sigma factor',
       'Ion transport', 'Chromosome', 'DNA condensation', 'Mitosis',
       'Nucleus', 'Repeat', 'DNA recombination', 'Transposable element',
       'Transposition', 'Cell membrane', 'Cell projection',
       'Disulfide bond', 'Amidation', 'Transport', 'Transferase',
       '3D-structure'], dtype=object)

In [62]:
# Create Spark session
from pyspark.sql import *
from pyspark.sql import functions as f
from pyspark.sql.types import *

spark = SparkSession.builder.appName('clean_uniprot_data').getOrCreate()
# Access the Spark UI on http://127.0.0.1:4040/

In [63]:
data = spark.read.option('header', True).load('data/data.csv', format='csv', delimiter='\t')
data = data.select([f.col(column).alias(column.strip()) for column in data.columns])
data = data.withColumn('reviewed', f.when(f.col('reviewed')=='true', True).otherwise(False))

In [64]:
data.show(n=5, truncate=False, vertical=True)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 entry_id         | G3HZV5                                                                                                                                                                                                                                                                                              

In [65]:
data.printSchema()

root
 |-- entry_id: string (nullable = true)
 |-- taxonomic_rank: string (nullable = true)
 |-- taxon: string (nullable = true)
 |-- keyword: string (nullable = true)
 |-- keyword_category: string (nullable = true)
 |-- sequence: string (nullable = true)
 |-- reviewed: boolean (nullable = false)
 |-- subsequence: string (nullable = true)



In [66]:
data = data.groupBy('entry_id', 'keyword', 'keyword_category', 'sequence', 'reviewed', 'subsequence').pivot('taxonomic_rank').agg(f.first('taxon'))

In [67]:
data.show(n=5, truncate=False, vertical=True)

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 entry_id         | A0A090BZQ0                                                                                                                                                                       
 keyword          | Signal                                                                                                                                                                           
 keyword_category | domain                                                                                                                                                                           
 sequence         | MESRSKPQFLAILTLFSVLFSQSLAWPLYGPPSSVRLDDRLQFEGAGDPDQVSLKADSDILQNALAENDTPYYDVSRNARHADGVFTSDYSRLLGQISAKKYLESLIGKRISSSISEDPVPVKRHSDAVFTDNYTRLRKQMAVKKYLNSILNGKRSSEGDSPDFLEELEK       
 reviewed 

In [68]:
data.select('keyword').distinct().show(n=100 ,truncate=False)

+----------------------------------+
|keyword                           |
+----------------------------------+
|Hormone                           |
|Coiled coil                       |
|null                              |
|Sigma factor                      |
|DNA recombination                 |
|Chromosome                        |
|Proteomics identification         |
|Transmembrane helix               |
|Cell adhesion                     |
|Nucleus                           |
|Neuropeptide                      |
|DNA-binding                       |
|Repeat                            |
|Amidation                         |
|Ion transport                     |
|Transposable element              |
|Hydrolase                         |
|Transport                         |
|Transferase                       |
|3D-structure                      |
|Signal                            |
|Cleavage on pair of basic residues|
|Reference proteome                |
|Cell projection                   |
|

In [69]:
data.select(f.countDistinct('entry_id')).show()

+------------------------+
|count(DISTINCT entry_id)|
+------------------------+
|                    1824|
+------------------------+

