# Retrieving Base Biomedical Nodes in WikiData

In [1]:
from pymantic import sparql
import pandas as pd
import sys
sys.path.append('../py')
import sparql_tools as qt

In [2]:
# SPARQL endpoint on localhost (port 9999) that query_node_type sends its queries to.
# NOTE(review): the /bigdata/sparql path suggests a local Blazegraph instance - confirm.
server = sparql.SPARQLServer('http://127.0.0.1:9999/bigdata/sparql')

In [3]:
# Load the node type names (one per line) and upper-case the first letter of
# every space-separated word, leaving the rest of each word untouched.
with open('data/n1_types.txt', 'r') as fin:
    types = fin.read().strip().split('\n')
# w[:1] rather than w[0]: an empty word (blank line, doubled space, or an
# empty file) yields '' instead of raising IndexError.
types = [' '.join(w[:1].upper() + w[1:] for w in t.split(' ')) for t in types]

In [4]:
# Maps each exported node type to the file its ids were written to;
# filled in by query_node_type.
node_files = {}

In [5]:
def query_node_type(query, n_type, filename):
    """Run a SPARQL query for one node type and save the matching ids.

    The query is sent to the module-level ``server``. The ``s`` column of the
    result is converted with ``qt.id_from_uri`` and de-duplicated, the ids are
    written to ``filename`` one per line, a summary line is printed, and the
    ``n_type -> filename`` pair is recorded in the module-level ``node_files``.

    Args:
        query: SPARQL query text; its result must contain an ``s`` binding.
        n_type: Name of the node type (used in the summary and as the
            ``node_files`` key).
        filename: Path of the output file; overwritten if it already exists.
    """
    # Query for ids of a node type
    result = server.query(query)
    qids = set(qt.query_to_df(result)['s'].apply(qt.id_from_uri))

    # Write ids to a file. Sorting keeps the file reproducible: iteration
    # order of a set of strings can differ from one interpreter run to the next.
    with open(filename, 'w') as fout:
        fout.writelines(qid + '\n' for qid in sorted(qids))

    # Print some results to screen
    print('Wrote {} ids of type {} to file {}'.format(len(qids), n_type, filename))

    # Add info about type. No ``global`` statement is needed because the
    # dict is mutated in place, never rebound.
    node_files[n_type] = filename


## Counting Nodes

### Diseases

In [6]:
# Items that are an instance of (wdt:P31) disease (wd:Q12136); the label
# service is asked for English labels.
# NOTE(review): ?p and ?o are projected but never bound in the WHERE clause.
query = """
SELECT ?s ?sLabel ?p ?o
WHERE
{
  # Instance of Diseae
  ?s wdt:P31 wd:Q12136 .
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

In [7]:
# Export the disease ids. Popping the type removes it from `types`, so it is
# not written to the remaining-types file at the end of the notebook.
filename = 'data/disease.txt'
ntype = types.pop(types.index('Disease'))

query_node_type(query=query, n_type=ntype, filename=filename)

Wrote 8666 ids of type Disease to file data/disease.txt


### What kind of Genes are there?

There are multiple subtypes of human genes:

- 'Protein-coding Gene'
- 'Non-coding RNA'
- 'Pseudogene'
- 'Gene' <- Catchall for those not in the other 3


In [8]:
# The four queries below (query, query1, query2, query3) each select items that
# are a subclass of (wdt:P279) one gene class and are found in taxon (wdt:P703)
# wd:Q15978631 (Homo sapiens, per the in-query comments).
# NOTE(review): ?sLabel is projected in all four, but no label service is
# invoked in these queries.

# Human Protein Coding Genes
query = """
SELECT ?s ?sLabel
WHERE
{
  # Subclass of Protein-Coding Gene
  ?s wdt:P279 wd:Q20747295 .
  # Found in Taxon Homo Sapiens
  ?s wdt:P703 wd:Q15978631 .
  
}
"""

# Non-coding RNA
query1 = """
SELECT ?s ?sLabel
WHERE
{
  # Subclass of non-coding RNA
  ?s wdt:P279 wd:Q427087 .
  # Found in Taxon Homo Sapiens
  ?s wdt:P703 wd:Q15978631 .
 
}
"""

# Pseudogene
query2 = """
SELECT ?s ?sLabel
WHERE
{
  # Subclass of Pseudogene
  ?s wdt:P279 wd:Q277338 .
  # Found in Taxon Homo Sapiens
  ?s wdt:P703 wd:Q15978631 .
}
"""

# Remaining Genes (not many)
# The MINUS clauses drop items that are also a direct subclass of one of the
# three classes queried above.
query3 = """
SELECT ?s ?sLabel
WHERE
{
  # Subclass of Gene
  ?s wdt:P279 wd:Q7187 .
  # Found in Taxon Homo Sapiens
  ?s wdt:P703 wd:Q15978631 .
  
  # Subtract out the other gene types
  MINUS {?s wdt:P279 wd:Q20747295}
  MINUS {?s wdt:P279 wd:Q277338}
  MINUS {?s wdt:P279 wd:Q427087} 
}
"""


#### Human Protein Coding Genes

In [9]:
# Export the human protein-coding gene ids (first of the four gene queries).
filename = 'data/pc_genes.txt'
ntype = types.pop(types.index('Protein-coding Gene'))

query_node_type(query=query, n_type=ntype, filename=filename)

Wrote 20706 ids of type Protein-coding Gene to file data/pc_genes.txt


#### Non-coding RNA

In [10]:
# Export the human non-coding RNA ids (query1).
filename = 'data/ncr_genes.txt'
ntype = types.pop(types.index('Non-coding RNA'))

query_node_type(query=query1, n_type=ntype, filename=filename)

Wrote 18066 ids of type Non-coding RNA to file data/ncr_genes.txt


#### Pseudogenes

In [11]:
# Export the human pseudogene ids (query2).
filename = 'data/pseu_genes.txt'
ntype = types.pop(types.index('Pseudogene'))

query_node_type(query=query2, n_type=ntype, filename=filename)

Wrote 16862 ids of type Pseudogene to file data/pseu_genes.txt


#### Other Genes

In [12]:
# Export the ids of the catch-all 'Gene' type (query3, which subtracts the
# three more specific gene classes).
filename = 'data/other_genes.txt'
ntype = types.pop(types.index('Gene'))

query_node_type(query=query3, n_type=ntype, filename=filename)

Wrote 4533 ids of type Gene to file data/other_genes.txt


### Proteins

Wikidata treats genes and proteins differently. 

In [13]:
# Items that are a subclass of (wdt:P279) protein (wd:Q8054) and are found in
# taxon (wdt:P703) wd:Q15978631.
# NOTE(review): the WHERE clause binds only ?s; ?sLabel, ?p, ?o and ?oLabel
# are projected but never bound, and no label service is invoked.
query = """
SELECT distinct ?s ?sLabel ?p ?o ?oLabel
WHERE
{
    #subclass of protein
    ?s wdt:P279 wd:Q8054 .
    #found in taxon homo sapiens
    ?s wdt:P703 wd:Q15978631 .

}
"""

In [14]:
# Export the human protein ids using the protein query defined just above.
filename = 'data/proteins.txt'
ntype = types.pop(types.index('Protein'))

query_node_type(query=query, n_type=ntype, filename=filename)

Wrote 27566 ids of type Protein to file data/proteins.txt


### What about chemical compounds?

In [15]:
# Items that are an instance of (wdt:P31) chemical compound (wd:Q11173).
# NOTE(review): the WHERE clause binds only ?s; ?sLabel, ?p and ?o are
# projected but never bound, and no label service is invoked.
query = """
SELECT ?s ?sLabel ?p ?o
WHERE
{
  # insance of Chemical Compound
  ?s wdt:P31 wd:Q11173 .
}
"""

In [16]:
# Export the chemical compound ids using the compound query defined just above.
filename = 'data/compounds.txt'
ntype = types.pop(types.index('Compound'))

query_node_type(query=query, n_type=ntype, filename=filename)

Wrote 156634 ids of type Compound to file data/compounds.txt


## Save some information about the remaining types

In [17]:
import json

In [18]:
# Persist the node-type -> id-file mapping collected in node_files as
# indented JSON.
with open('node_info.json', 'w') as fout:
    fout.write(json.dumps(node_files, indent=2))

In [19]:
# Whatever is still left in `types` was never popped by an export cell;
# save those names one per line.
with open('data/remain_types.txt', 'w') as fout:
    fout.writelines(name + '\n' for name in types)