In [1]:
import requests
import pandas as pd

In [97]:
url = 'https://sparql.uniprot.org/sparql/'
query = '''
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX keywords: <http://purl.uniprot.org/keywords/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX faldo: <http://biohackathon.org/resource/faldo#>
SELECT DISTINCT
    (CONCAT(SUBSTR(STR(?protein), 33)) AS ?entry_id)
    ?protein_family
    ?taxonomic_rank
    ?taxon
    ?keyword
    ?keyword_category
    ?go_term
    ?go_category
    ?sequence
    ?reviewed
WHERE
{
    ?protein a up:Protein .
    {
        ?protein rdfs:seeAlso <http://purl.uniprot.org/interpro/IPR000532> .
        BIND('glucagon_gip_secretin_vip' AS ?protein_family)
    } UNION {
        ?protein rdfs:seeAlso <http://purl.uniprot.org/interpro/IPR009136> .
        BIND('vegfr2_rcpt' AS ?protein_family)
    }
    ?protein up:sequence ?sequence_class .
    FILTER NOT EXISTS { ?sequence_class up:basedOn ?primary_sequence }
    #FILTER NOT EXISTS { ?sequence_class up:precursor true }
    FILTER NOT EXISTS { ?sequence_class up:fragment 'single' }
    FILTER NOT EXISTS { ?sequence_class up:fragment 'multiple' }
    ?sequence_class rdf:value ?sequence .
    OPTIONAL {
        ?protein up:organism ?organism .
        ?organism rdfs:subClassOf ?ancestor .
        ?ancestor up:rank ?taxonomic_rank_class .
        BIND(SUBSTR(STR(?taxonomic_rank_class), 30) AS ?taxonomic_rank) .
        ?ancestor up:scientificName ?taxon .
    }
    OPTIONAL {
        ?protein up:classifiedWith ?keyword_class .
        {
            ?keyword_class rdfs:subClassOf keywords:9999 .
            BIND('biological_process' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9998 .
            BIND('cellular_component' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9997 .
            BIND('coding_sequence_diversity' AS ?keyword_category)
        } UNION {
            ?keywords_class rdfs:subClassOf keywords:9996 .
            BIND('developmental_stage' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9995 .
            BIND('disease' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9994 .
            BIND('domain' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9993 .
            BIND('ligand' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9992 .
            BIND('molecular_function' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9991 .
            BIND('post_translational_modification' AS ?keyword_category)
        } UNION {
            ?keyword_class rdfs:subClassOf keywords:9990 .
            BIND('technical_term' AS ?keyword_category)
        }
        ?keyword_class skos:prefLabel ?keyword .
    }
    OPTIONAL { ?protein up:reviewed ?reviewed_true . }
    BIND(IF(BOUND(?reviewed_true), 'true', 'false') AS ?reviewed)
    OPTIONAL {
        ?protein up:classifiedWith ?go_term_class .
        {
            ?go_term_class rdfs:subClassOf <http://purl.obolibrary.org/obo/GO_0008150> .
            BIND("biological process" AS ?go_category)
        } UNION {
            ?go_term_class rdfs:subClassOf <http://purl.obolibrary.org/obo/GO_0005575> .
            BIND("molecular function" AS ?go_category)
        } UNION {
            ?go_term_class rdfs:subClassOf <http://purl.obolibrary.org/obo/GO_0003674> .
            BIND("cellular component" AS ?go_category)
        }
        ?go_term_class rdfs:label ?go_term .
    }
}
'''
r = requests.get(url, params={'format': 'tab', 'query': query}, headers={'From': 'nilasschusler@gmail.com'}, stream=True)

In [None]:
# Code for getting subsequences:
"""
?protein up:annotation ?annotaion .
?annotaion rdf:type up:Domain_Extent_Annotation .
?annotaion rdfs:comment 'GLUCAGON' .

?annotaion up:range ?range .
?range faldo:end ?end_class .
?range faldo:begin ?begin_class .
?end_class faldo:position ?end .
?begin_class faldo:position ?begin .
BIND(SUBSTR(?sequence, ?begin, ?end-?begin+1) AS ?subsequence)
"""

In [98]:
outfile = open('data/data.csv','wb')
for chunk in r.iter_content(chunk_size=128):
    outfile.write(chunk)
outfile.close()

In [99]:
data = pd.read_csv('data/data.csv', sep='\t')
data.columns = [column.strip() for column in data.columns]
data

Unnamed: 0,entry_id,protein_family,taxonomic_rank,taxon,keyword,keyword_category,go_term,go_category,sequence,reviewed
0,P16613,glucagon_gip_secretin_vip,Family,Bovidae,Neurogenesis,biological_process,insulin secretion,biological process,MTMCSGARLALLVYGILMHSSVYGSPAASGLRFPGIRPENEAYDED...,True
1,P16613,glucagon_gip_secretin_vip,Superkingdom,Eukaryota,Neurogenesis,biological_process,insulin secretion,biological process,MTMCSGARLALLVYGILMHSSVYGSPAASGLRFPGIRPENEAYDED...,True
2,P16613,glucagon_gip_secretin_vip,Superorder,Laurasiatheria,Neurogenesis,biological_process,insulin secretion,biological process,MTMCSGARLALLVYGILMHSSVYGSPAASGLRFPGIRPENEAYDED...,True
3,P16613,glucagon_gip_secretin_vip,Kingdom,Metazoa,Neurogenesis,biological_process,insulin secretion,biological process,MTMCSGARLALLVYGILMHSSVYGSPAASGLRFPGIRPENEAYDED...,True
4,P16613,glucagon_gip_secretin_vip,Infraorder,Pecora,Neurogenesis,biological_process,insulin secretion,biological process,MTMCSGARLALLVYGILMHSSVYGSPAASGLRFPGIRPENEAYDED...,True
...,...,...,...,...,...,...,...,...,...,...
1344874,A0A6F9AK13,vegfr2_rcpt,Phylum,Chordata,,,,,MLDCWLDKPTDRPNFTELVEHLGNLLQASAHQDGKDYIPLTAVEVE...,True
1344875,A0A6F9AK13,vegfr2_rcpt,Superclass,Actinopterygii,,,,,MLDCWLDKPTDRPNFTELVEHLGNLLQASAHQDGKDYIPLTAVEVE...,True
1344876,A0A6F9AK13,vegfr2_rcpt,Order,Salmoniformes,,,,,MLDCWLDKPTDRPNFTELVEHLGNLLQASAHQDGKDYIPLTAVEVE...,True
1344877,A0A6F9AK13,vegfr2_rcpt,Family,Salmonidae,,,,,MLDCWLDKPTDRPNFTELVEHLGNLLQASAHQDGKDYIPLTAVEVE...,True


In [100]:
data_wide = data.pivot(index=['entry_id', 'protein_family', 'sequence', 'reviewed', 'keyword', 'keyword_category', 'go_term', 'go_category'], columns='taxonomic_rank', values='taxon')
data_wide = data_wide.reset_index(level=['entry_id', 'protein_family', 'sequence', 'reviewed', 'keyword', 'keyword_category', 'go_term', 'go_category'])
data_wide.columns.names = ['']

In [101]:
with pd.option_context('display.max_columns', None):
    display(data_wide)

Unnamed: 0,entry_id,protein_family,sequence,reviewed,keyword,keyword_category,go_term,go_category,Class,Cohort,Family,Genus,Infraclass,Infraorder,Kingdom,Order,Parvorder,Phylum,Species,Species_Group,Species_Subgroup,Subclass,Subcohort,Subfamily,Subgenus,Subkingdom,Suborder,Subphylum,Superclass,Superfamily,Superkingdom,Superorder,Tribe
0,A0A060VPW7,glucagon_gip_secretin_vip,MSSKATLALLIYGIIMHYSIHCSPLGLSYPNLRLENEVYDEDGNSL...,True,Reference proteome,technical_term,extracellular region,molecular function,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
1,A0A060VPW7,glucagon_gip_secretin_vip,MSSKATLALLIYGIIMHYSIHCSPLGLSYPNLRLENEVYDEDGNSL...,True,Reference proteome,technical_term,hormone activity,cellular component,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
2,A0A060VPW7,glucagon_gip_secretin_vip,MSSKATLALLIYGIIMHYSIHCSPLGLSYPNLRLENEVYDEDGNSL...,True,Signal,domain,extracellular region,molecular function,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
3,A0A060VPW7,glucagon_gip_secretin_vip,MSSKATLALLIYGIIMHYSIHCSPLGLSYPNLRLENEVYDEDGNSL...,True,Signal,domain,hormone activity,cellular component,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
4,A0A060VXS0,glucagon_gip_secretin_vip,MKGTHSFAGLLLLIIAQSSLQIPQEDTEDNSSLLTEDSMFSEPREL...,True,Reference proteome,technical_term,extracellular region,molecular function,Actinopteri,Euteleosteomorpha,Salmonidae,Oncorhynchus,Teleostei,,Metazoa,Salmoniformes,,Chordata,,,,Neopterygii,,Salmoninae,,,,Craniata,Actinopterygii,,Eukaryota,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104408,W5Q8U4,glucagon_gip_secretin_vip,MLLWVFFLVTLTLSSGSQGSLPSQPLRIPRYADAIFTNSYRKILGQ...,True,Signal,domain,positive regulation of cell population prolife...,biological process,Mammalia,,Bovidae,Ovis,,Pecora,Metazoa,Artiodactyla,,Chordata,,,,,,Caprinae,,,Ruminantia,Craniata,Sarcopterygii,,Eukaryota,Laurasiatheria,
104409,W5Q8U4,glucagon_gip_secretin_vip,MLLWVFFLVTLTLSSGSQGSLPSQPLRIPRYADAIFTNSYRKILGQ...,True,Signal,domain,positive regulation of growth hormone secretion,biological process,Mammalia,,Bovidae,Ovis,,Pecora,Metazoa,Artiodactyla,,Chordata,,,,,,Caprinae,,,Ruminantia,Craniata,Sarcopterygii,,Eukaryota,Laurasiatheria,
104410,W5Q8U4,glucagon_gip_secretin_vip,MLLWVFFLVTLTLSSGSQGSLPSQPLRIPRYADAIFTNSYRKILGQ...,True,Signal,domain,positive regulation of multicellular organism ...,biological process,Mammalia,,Bovidae,Ovis,,Pecora,Metazoa,Artiodactyla,,Chordata,,,,,,Caprinae,,,Ruminantia,Craniata,Sarcopterygii,,Eukaryota,Laurasiatheria,
104411,W5Q8U4,glucagon_gip_secretin_vip,MLLWVFFLVTLTLSSGSQGSLPSQPLRIPRYADAIFTNSYRKILGQ...,True,Signal,domain,response to food,biological process,Mammalia,,Bovidae,Ovis,,Pecora,Metazoa,Artiodactyla,,Chordata,,,,,,Caprinae,,,Ruminantia,Craniata,Sarcopterygii,,Eukaryota,Laurasiatheria,


In [102]:
data_wide['entry_id'].nunique()

2395

In [103]:
data_wide['keyword'].unique()

array(['Reference proteome', 'Signal', 'Secreted', 'Angiogenesis',
       'Cell membrane', 'Disulfide bond', 'Endoplasmic reticulum',
       'Endosome', 'Immunoglobulin domain', 'Receptor', 'Repeat',
       'Transmembrane helix', 'Tyrosine-protein kinase',
       'Developmental protein', 'Glycoprotein', 'Phosphoprotein',
       'Ubl conjugation', 'Cleavage on pair of basic residues', 'Hormone',
       'Proteomics identification', 'Coiled coil', 'Transferase',
       'Neuropeptide', 'Cell adhesion', 'Hydrolase', nan, 'DNA-binding',
       'Sigma factor', 'Ion transport', 'Chromosome', 'DNA condensation',
       'Mitosis', 'Nucleus', 'DNA recombination', 'Transposable element',
       'Transposition', 'ATP-binding', 'Cytoplasm', 'Immunity',
       'Cell projection', 'Lyase', 'Amidation', 'Transport',
       'Direct protein sequencing',
       'G-protein coupled receptor impairing toxin', 'Hypotensive agent',
       'G-protein coupled receptor', 'Antiviral defense', 'Helicase',
       'In

In [104]:
data_wide['go_term'].unique()

array(['extracellular region', 'hormone activity',
       'regulation of fatty acid biosynthetic process',
       'regulation of insulin secretion', 'response to glucose',
       'mRNA stabilization', 'prolactin secretion', 'ATP binding',
       'angiogenesis', 'cell junction', 'early endosome',
       'endoplasmic reticulum', 'growth factor binding',
       'integral component of membrane', 'plasma membrane',
       'vascular endothelial growth factor receptor signaling pathway',
       'vascular endothelial growth factor-activated receptor activity',
       'nucleus', 'ERK1 and ERK2 cascade', 'Golgi apparatus',
       'branching involved in blood vessel morphogenesis',
       'cadherin binding',
       'calcium-mediated signaling using intracellular calcium source',
       'cellular response to hydrogen sulfide',
       'identical protein binding',
       'integral component of plasma membrane', 'integrin binding',
       'membrane raft',
       'negative regulation of endothelial ce

In [105]:
# Create Spark session
from pyspark.sql import *
from pyspark.sql import functions as f
from pyspark.sql.types import *

spark = SparkSession.builder.appName('clean_uniprot_data').getOrCreate()
# Access the Spark UI on http://127.0.0.1:4040/

In [106]:
data = spark.read.option('header', True).load('data/data.csv', format='csv', delimiter='\t')
data = data.select([f.col(column).alias(column.strip()) for column in data.columns])
data = data.withColumn('reviewed', f.when(f.col('reviewed')=='true', True).otherwise(False))

In [107]:
data.show(n=5, truncate=False, vertical=True)

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 entry_id         | P16613                                                                                                                                                                           
 protein_family   | glucagon_gip_secretin_vip                                                                                                                                                        
 taxonomic_rank   | Family                                                                                                                                                                           
 taxon            | Bovidae                                                                                                                                                                          
 keyword  

In [108]:
data.printSchema()

root
 |-- entry_id: string (nullable = true)
 |-- protein_family: string (nullable = true)
 |-- taxonomic_rank: string (nullable = true)
 |-- taxon: string (nullable = true)
 |-- keyword: string (nullable = true)
 |-- keyword_category: string (nullable = true)
 |-- go_term: string (nullable = true)
 |-- go_category: string (nullable = true)
 |-- sequence: string (nullable = true)
 |-- reviewed: boolean (nullable = false)



In [109]:
data_taxon = data.groupBy('entry_id').pivot('taxonomic_rank').agg(f.first('taxon'))

In [110]:
data_keyword = data.withColumn('concat_column', f.lower(f.concat_ws('_', f.lit('keyword'), f.col('keyword_category'), f.col('keyword'))) )
data_keyword = data_keyword.withColumn('concat_column', f.translate('concat_column', ' ,', '_') )

data_keyword = data_keyword.groupBy('entry_id').pivot('concat_column').agg(f.first('keyword'))
data_keyword = data_keyword.drop('keyword')

for column_name in data_keyword.columns[1:]:
    data_keyword = data_keyword.withColumn(column_name, f.when(f.col(column_name).isNull(), False).otherwise(True))

In [111]:
data_go_term = data.withColumn('concat_column', f.lower(f.concat_ws('_', f.lit('go_term'), f.col('go_category'), f.col('go_term'))) )
data_go_term = data_go_term.withColumn('concat_column', f.translate('concat_column', ' ,', '_') )

data_go_term = data_go_term.groupBy('entry_id').pivot('concat_column').agg(f.first('go_term'))
data_go_term = data_go_term.drop('go_term')

for column_name in data_go_term.columns[1:]:
    data_go_term = data_go_term.withColumn(column_name, f.when(f.col(column_name).isNull(), False).otherwise(True))

In [112]:
data_rest = data.select('entry_id', 'protein_family' , 'sequence', 'reviewed').dropDuplicates()

In [113]:
data_full = data_rest.join(data_keyword, on=['entry_id'], how='left').join(data_go_term, on=['entry_id'], how='left')

In [114]:
data_full.filter(f.col('entry_id') == 'G3HZV5').show(n=100, truncate=False, vertical=True)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 entry_id                                                                                                                                                      | G3HZV5                                                                                                             

In [115]:
data_full.count()

2395

In [116]:
data_full.select(f.countDistinct('entry_id')).show()

+------------------------+
|count(DISTINCT entry_id)|
+------------------------+
|                    2395|
+------------------------+



In [117]:
data_full.printSchema()

root
 |-- entry_id: string (nullable = true)
 |-- protein_family: string (nullable = true)
 |-- sequence: string (nullable = true)
 |-- reviewed: boolean (nullable = false)
 |-- keyword_biological_process_angiogenesis: boolean (nullable = true)
 |-- keyword_biological_process_antiviral_defense: boolean (nullable = true)
 |-- keyword_biological_process_cell_adhesion: boolean (nullable = true)
 |-- keyword_biological_process_differentiation: boolean (nullable = true)
 |-- keyword_biological_process_dna_condensation: boolean (nullable = true)
 |-- keyword_biological_process_dna_recombination: boolean (nullable = true)
 |-- keyword_biological_process_host-virus_interaction: boolean (nullable = true)
 |-- keyword_biological_process_immunity: boolean (nullable = true)
 |-- keyword_biological_process_innate_immunity: boolean (nullable = true)
 |-- keyword_biological_process_ion_transport: boolean (nullable = true)
 |-- keyword_biological_process_mitosis: boolean (nullable = true)
 |-- keyword

In [118]:
data_full.coalesce(1).write.csv('data/data_clean', header='true')