In [1]:
import requests
import pandas as pd

In [4]:
url = 'https://sparql.uniprot.org/sparql/'
query = '''
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX keywords: <http://purl.uniprot.org/keywords/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX faldo: <http://biohackathon.org/resource/faldo#>
SELECT DISTINCT
    (CONCAT(SUBSTR(STR(?protein), 33)) AS ?entry_id)
    ?taxonomic_rank
    ?taxon
    ?keyword
    ?keyword_category
    ?go_term
    ?go_category
    ?sequence
    ?subsequence
    ?reviewed
WHERE
{
    ?protein a up:Protein .
    ?protein up:annotation ?annotaion .
    ?annotaion rdf:type up:Domain_Extent_Annotation .
    ?annotaion rdfs:comment 'GLUCAGON' .
    ?protein up:sequence ?sequence_class .
    FILTER NOT EXISTS { ?sequence_class up:basedOn ?primary_sequence }
    #FILTER NOT EXISTS { ?sequence_class up:precursor true }
    FILTER NOT EXISTS { ?sequence_class up:fragment 'single' }
    FILTER NOT EXISTS { ?sequence_class up:fragment 'multiple' }
    ?sequence_class rdf:value ?sequence .
    ?annotaion up:range ?range .
    ?range faldo:end ?end_class .
    ?range faldo:begin ?begin_class .
    ?end_class faldo:position ?end .
    ?begin_class faldo:position ?begin .
    BIND(SUBSTR(?sequence, ?begin, ?end-?begin+1) AS ?subsequence)
    OPTIONAL {
        ?protein up:organism ?organism .
        ?organism rdfs:subClassOf ?ancestor .
        ?ancestor up:rank ?taxonomic_rank_class .
        BIND(SUBSTR(STR(?taxonomic_rank_class), 30) AS ?taxonomic_rank) .
        ?ancestor up:scientificName ?taxon .
    }
    OPTIONAL {
        ?protein up:classifiedWith ?keyword_class .
        {
            ?keyword_class rdfs:subClassOf keywords:9999 .
            BIND('biological_process' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9998 .
            BIND('cellular_component' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9997 .
            BIND('coding_sequence_diversity' AS ?keyword_category)
        } UNION {
            ?keywords_class rdfs:subClassOf keywords:9996 .
            BIND('developmental_stage' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9995 .
            BIND('disease' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9994 .
            BIND('domain' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9993 .
            BIND('ligand' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9992 .
            BIND('molecular_function' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9991 .
            BIND('post_translational_modification' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9990 .
            BIND('technical_term' AS ?keyword_category)
        }
        ?keyword_class skos:prefLabel ?keyword .
    }
    OPTIONAL { ?protein up:reviewed ?reviewed_true . }
    BIND(IF(BOUND(?reviewed_true), 'true', 'false') AS ?reviewed)
    OPTIONAL {
        ?protein up:classifiedWith ?go_term_class .
        {
            ?go_term_class rdfs:subClassOf <http://purl.obolibrary.org/obo/GO_0008150> .
            BIND("biological process" AS ?go_category)
        } UNION {
            ?go_term_class rdfs:subClassOf <http://purl.obolibrary.org/obo/GO_0005575> .
            BIND("molecular function" AS ?go_category)
        } UNION {
            ?go_term_class rdfs:subClassOf <http://purl.obolibrary.org/obo/GO_0003674> .
            BIND("cellular component" AS ?go_category)
        }
        ?go_term_class rdfs:label ?go_term .
    }
}
'''
r = requests.get(url, params={'format': 'tab', 'query': query}, headers={'From': 'nilasschusler@gmail.com'}, stream=True)

In [5]:
outfile = open('data/data.csv','wb')
for chunk in r.iter_content(chunk_size=128):
    outfile.write(chunk)
outfile.close()

In [6]:
data = pd.read_csv('data/data.csv', sep='\t')
data.columns = [column.strip() for column in data.columns]
data

Unnamed: 0,entry_id,taxonomic_rank,taxon,keyword,keyword_category,go_term,go_category,sequence,subsequence,reviewed
0,G3HZV5,Subfamily,Cricetinae,Cell adhesion,biological_process,cell adhesion,biological process,MKAGAKNPVVRVFIVDTTYPHHVGPIEVPVPEMIASRFYSSNEFEG...,HSQGTFTSDYSKYLDSRRAQDFV,True
1,G3HZV5,Subfamily,Cricetinae,Cell adhesion,biological_process,cell adhesion,biological process,MKAGAKNPVVRVFIVDTTYPHHVGPIEVPVPEMIASRFYSSNEFEG...,HADGSFSDEMNTILDSLATRDFI,True
2,G3HZV5,Genus,Cricetulus,Cell adhesion,biological_process,cell adhesion,biological process,MKAGAKNPVVRVFIVDTTYPHHVGPIEVPVPEMIASRFYSSNEFEG...,HSQGTFTSDYSKYLDSRRAQDFV,True
3,G3HZV5,Genus,Cricetulus,Cell adhesion,biological_process,cell adhesion,biological process,MKAGAKNPVVRVFIVDTTYPHHVGPIEVPVPEMIASRFYSSNEFEG...,HADGSFSDEMNTILDSLATRDFI,True
4,A0A151M097,Order,Crocodylia,Cell adhesion,biological_process,cell adhesion,biological process,MDKVSTLYIEYPKAMKMKSVYFVAGLLLMIVQGSWQNPLQDTEEKS...,HAEFERHAEGTYTSDITSYLEGQ,True
...,...,...,...,...,...,...,...,...,...,...
281394,A0A6F9AX47,Order,Salmoniformes,,,,,MDCKCIQCIEETEREKPEDIYQRTAGSQAQTPPFSPTLNRGNTGGQ...,HAEGTYTSDVSSYLQDQAAKEFV,True
281395,A0A6F9AX47,Family,Salmonidae,,,,,MDCKCIQCIEETEREKPEDIYQRTAGSQAQTPPFSPTLNRGNTGGQ...,HSEGTFSNDYSKYLESRRAQDFV,True
281396,A0A6F9AX47,Family,Salmonidae,,,,,MDCKCIQCIEETEREKPEDIYQRTAGSQAQTPPFSPTLNRGNTGGQ...,HAEGTYTSDVSSYLQDQAAKEFV,True
281397,A0A6F9AX47,Subphylum,Craniata,,,,,MDCKCIQCIEETEREKPEDIYQRTAGSQAQTPPFSPTLNRGNTGGQ...,HSEGTFSNDYSKYLESRRAQDFV,True


In [8]:
data_wide = data.pivot(index=['entry_id', 'sequence', 'reviewed', 'keyword', 'keyword_category', 'subsequence', 'go_term', 'go_category'], columns='taxonomic_rank', values='taxon')
data_wide = data_wide.reset_index(level=['entry_id', 'sequence', 'reviewed', 'keyword', 'keyword_category', 'subsequence', 'go_term', 'go_category'])
data_wide.columns.names = ['']

In [9]:
with pd.option_context('display.max_columns', None):
    display(data_wide)

Unnamed: 0,entry_id,sequence,reviewed,keyword,keyword_category,subsequence,go_term,go_category,Class,Cohort,Family,Genus,Infraclass,Infraorder,Kingdom,Order,Parvorder,Phylum,Species,Species_Group,Subclass,Subcohort,Subfamily,Subgenus,Subkingdom,Suborder,Subphylum,Superclass,Superfamily,Superkingdom,Superorder,Tribe
0,A0A060VPW7,MSSKATLALLIYGIIMHYSIHCSPLGLSYPNLRLENEVYDEDGNSL...,True,Reference proteome,technical_term,HSDGIFTDSYSRYRKQMAVKKYL,extracellular region,molecular function,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
1,A0A060VPW7,MSSKATLALLIYGIIMHYSIHCSPLGLSYPNLRLENEVYDEDGNSL...,True,Reference proteome,technical_term,HSDGIFTDSYSRYRKQMAVKKYL,hormone activity,cellular component,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
2,A0A060VPW7,MSSKATLALLIYGIIMHYSIHCSPLGLSYPNLRLENEVYDEDGNSL...,True,Signal,domain,HSDGIFTDSYSRYRKQMAVKKYL,extracellular region,molecular function,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
3,A0A060VPW7,MSSKATLALLIYGIIMHYSIHCSPLGLSYPNLRLENEVYDEDGNSL...,True,Signal,domain,HSDGIFTDSYSRYRKQMAVKKYL,hormone activity,cellular component,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
4,A0A060VXS0,MKGTHSFAGLLLLIIAQSSLQIPQEDTEDNSSLLTEDSMFSEPREL...,True,Reference proteome,technical_term,HAEGTYTSDMSSYLQDQAAKEFV,extracellular region,molecular function,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22249,W5Q8U4,MLLWVFFLVTLTLSSGSQGSLPSQPLRIPRYADAIFTNSYRKILGQ...,True,Signal,domain,YADAIFTNSYRKILGQLSARKLL,positive regulation of cell population prolife...,biological process,Mammalia,,Bovidae,Ovis,,Pecora,Metazoa,Artiodactyla,,Chordata,,,,,Caprinae,,,Ruminantia,Craniata,Sarcopterygii,,Eukaryota,Laurasiatheria,
22250,W5Q8U4,MLLWVFFLVTLTLSSGSQGSLPSQPLRIPRYADAIFTNSYRKILGQ...,True,Signal,domain,YADAIFTNSYRKILGQLSARKLL,positive regulation of growth hormone secretion,biological process,Mammalia,,Bovidae,Ovis,,Pecora,Metazoa,Artiodactyla,,Chordata,,,,,Caprinae,,,Ruminantia,Craniata,Sarcopterygii,,Eukaryota,Laurasiatheria,
22251,W5Q8U4,MLLWVFFLVTLTLSSGSQGSLPSQPLRIPRYADAIFTNSYRKILGQ...,True,Signal,domain,YADAIFTNSYRKILGQLSARKLL,positive regulation of multicellular organism ...,biological process,Mammalia,,Bovidae,Ovis,,Pecora,Metazoa,Artiodactyla,,Chordata,,,,,Caprinae,,,Ruminantia,Craniata,Sarcopterygii,,Eukaryota,Laurasiatheria,
22252,W5Q8U4,MLLWVFFLVTLTLSSGSQGSLPSQPLRIPRYADAIFTNSYRKILGQ...,True,Signal,domain,YADAIFTNSYRKILGQLSARKLL,response to food,biological process,Mammalia,,Bovidae,Ovis,,Pecora,Metazoa,Artiodactyla,,Chordata,,,,,Caprinae,,,Ruminantia,Craniata,Sarcopterygii,,Eukaryota,Laurasiatheria,


In [10]:
data_wide['entry_id'].nunique()

1824

In [11]:
data_wide['keyword'].unique()

array(['Reference proteome', 'Signal', 'Secreted',
       'Cleavage on pair of basic residues', 'Hormone', 'Coiled coil',
       'Transmembrane helix', 'Neuropeptide', 'Proteomics identification',
       'Cell adhesion', 'Hydrolase', nan, 'DNA-binding', 'Sigma factor',
       'Ion transport', 'Chromosome', 'DNA condensation', 'Mitosis',
       'Nucleus', 'Repeat', 'DNA recombination', 'Transposable element',
       'Transposition', 'Cell membrane', 'Cell projection',
       'Disulfide bond', 'Amidation', 'Transport', 'Transferase',
       '3D-structure'], dtype=object)

In [13]:
data_wide['go_term'].unique()

array(['extracellular region', 'hormone activity',
       'regulation of fatty acid biosynthetic process',
       'regulation of insulin secretion', 'response to glucose',
       'mRNA stabilization', 'prolactin secretion', 'brain development',
       'camera-type eye development',
       'adenylate cyclase-modulating G protein-coupled receptor signaling pathway',
       'phospholipase C-activating G protein-coupled receptor signaling pathway',
       'G protein-coupled receptor signaling pathway',
       'insulin secretion',
       'pituitary adenylate cyclase activating polypeptide activity',
       'pituitary adenylate cyclase-activating polypeptide receptor binding',
       'ESCRT II complex', 'multivesicular body sorting pathway',
       'adenylate cyclase-activating G protein-coupled receptor signaling pathway',
       'cytoplasm', 'extracellular space', 'glucose homeostasis',
       'identical protein binding',
       'negative regulation of execution phase of apoptosis',
      

In [17]:
# Create Spark session
from pyspark.sql import *
from pyspark.sql import functions as f
from pyspark.sql.types import *

spark = SparkSession.builder.appName('clean_uniprot_data').getOrCreate()
# Access the Spark UI on http://127.0.0.1:4040/

In [18]:
data = spark.read.option('header', True).load('data/data.csv', format='csv', delimiter='\t')
data = data.select([f.col(column).alias(column.strip()) for column in data.columns])
data = data.withColumn('reviewed', f.when(f.col('reviewed')=='true', True).otherwise(False))

In [19]:
data.show(n=5, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 entry_id  

In [20]:
data.printSchema()

root
 |-- entry_id: string (nullable = true)
 |-- taxonomic_rank: string (nullable = true)
 |-- taxon: string (nullable = true)
 |-- keyword: string (nullable = true)
 |-- keyword_category: string (nullable = true)
 |-- go_term: string (nullable = true)
 |-- go_category: string (nullable = true)
 |-- sequence: string (nullable = true)
 |-- subsequence: string (nullable = true)
 |-- reviewed: boolean (nullable = false)



In [21]:
data = data.groupBy('entry_id', 'keyword', 'keyword_category', 'sequence', 'reviewed', 'subsequence', 'go_term', 'go_category').pivot('taxonomic_rank').agg(f.first('taxon'))

In [22]:
data.show(n=5, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [23]:
data.select('keyword').distinct().show(n=100 ,truncate=False)

+----------------------------------+
|keyword                           |
+----------------------------------+
|Hormone                           |
|Coiled coil                       |
|null                              |
|Sigma factor                      |
|DNA recombination                 |
|Chromosome                        |
|Proteomics identification         |
|Transmembrane helix               |
|Cell adhesion                     |
|Nucleus                           |
|Neuropeptide                      |
|DNA-binding                       |
|Repeat                            |
|Amidation                         |
|Ion transport                     |
|Transposable element              |
|Hydrolase                         |
|Transport                         |
|Transferase                       |
|3D-structure                      |
|Signal                            |
|Cleavage on pair of basic residues|
|Reference proteome                |
|Cell projection                   |
|

In [25]:
data.select('go_term').distinct().show(n=200 ,truncate=False)

+------------------------------------------------------------------------------------------+
|go_term                                                                                   |
+------------------------------------------------------------------------------------------+
|negative regulation of potassium ion transport                                            |
|ESCRT II complex                                                                          |
|negative regulation of acute inflammatory response to antigenic stimulus                  |
|digestive hormone activity                                                                |
|signaling receptor binding                                                                |
|dipeptidyl-peptidase activity                                                             |
|negative regulation of extracellular matrix disassembly                                   |
|positive regulation of peptidyl-serine phosphorylation               

In [26]:
data.select(f.countDistinct('entry_id')).show()

+------------------------+
|count(DISTINCT entry_id)|
+------------------------+
|                    1824|
+------------------------+

