# Testing pymantic: counting the biomedical nodes in Wikidata

In [1]:
from pymantic import sparql
import pandas as pd
import sys
sys.path.append('../py')
import sparql_tools as qt

In [2]:
# Address of the SPARQL endpoint, served from the local machine on port 9999.
endpoint_url = 'http://127.0.0.1:9999/bigdata/sparql'

# Server handle through which the queries in this notebook are submitted.
server = sparql.SPARQLServer(endpoint_url)

## Counting Nodes

### Diseases

In [3]:
# SPARQL query: every item that is an instance of disease, together with its
# English label.  Only ?s and ?sLabel are projected because the graph pattern
# binds no other variable.
query = """
SELECT ?s ?sLabel
WHERE
{
  # Instance of Disease
  ?s wdt:P31 wd:Q12136 .

  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

In [4]:
# Submit the current `query` to the SPARQL server
result = server.query(query)

# Convert every value of the 's' column with qt.id_from_uri, keep the unique
# results, and display the number of diseases
disease_frame = qt.query_to_df(result)
disease_ids = disease_frame['s'].apply(qt.id_from_uri)
diseases = set(disease_ids)
len(diseases)

8666

#### Save diseases to a file

In [5]:
# Write one disease identifier per line.  The identifiers are sorted first:
# iterating a set has no guaranteed order, so an unsorted dump could differ
# between runs even when the underlying data is unchanged.
with open('data/diseases.txt', 'w') as fout:
    fout.writelines(disease + '\n' for disease in sorted(diseases))

### How many Genes are there?

There are multiple subtypes of human genes:

- 'Protein-coding Gene'
- 'Non-coding RNA'
- 'Pseudogene'
- 'Gene' <- Catchall for those not in the other 3


In [6]:
def _human_subclass_query(parent_class, excluded_classes=()):
    """Build a SPARQL query for human items that are subclasses of a class.

    Args:
        parent_class: Wikidata item id (for example ``'Q7187'``) that the
            selected items must be a subclass of (``wdt:P279``).
        excluded_classes: Wikidata item ids; every item that is a subclass
            of one of them is subtracted from the results with ``MINUS``.

    Returns:
        The query text.  It selects ``?s`` and its English label ``?sLabel``
        and keeps only items found in taxon Homo sapiens
        (``wdt:P703 wd:Q15978631``).
    """
    lines = [
        '',
        'SELECT ?s ?sLabel',
        'WHERE',
        '{',
        '  # Subclass of the requested class',
        '  ?s wdt:P279 wd:' + parent_class + ' .',
        '  # Found in Taxon Homo Sapiens',
        '  ?s wdt:P703 wd:Q15978631 .',
        '',
    ]
    if excluded_classes:
        lines.append('  # Subtract out the other gene types')
        for excluded in excluded_classes:
            lines.append('  MINUS {?s wdt:P279 wd:' + excluded + '}')
        lines.append('')
    lines.append(
        '  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }'
    )
    lines.append('}')
    return '\n'.join(lines) + '\n'


# Wikidata item ids of the gene classes counted in this notebook.
PROTEIN_CODING_GENE = 'Q20747295'
NON_CODING_RNA = 'Q427087'
PSEUDOGENE = 'Q277338'
GENE = 'Q7187'

# Human Protein Coding Genes
query = _human_subclass_query(PROTEIN_CODING_GENE)

# Non-coding RNA
query1 = _human_subclass_query(NON_CODING_RNA)

# Pseudogene
query2 = _human_subclass_query(PSEUDOGENE)

# Remaining Genes (not many): subclasses of Gene that are not also subclasses
# of any of the three more specific classes above
query3 = _human_subclass_query(
    GENE,
    excluded_classes=(PROTEIN_CODING_GENE, PSEUDOGENE, NON_CODING_RNA),
)


#### Human Protein Coding Genes

In [7]:
# Submit the protein-coding gene query held in `query`
result = server.query(query)

# Convert the 's' column with qt.id_from_uri, de-duplicate, and show the count
pc_gene_ids = qt.query_to_df(result)['s'].apply(qt.id_from_uri)
h_pc_genes = set(pc_gene_ids)
len(h_pc_genes)

20706

In [8]:
# Save to a file, one identifier per line.  Sorted so the output is stable
# across runs (a set has no guaranteed iteration order).
with open('data/pc_genes.txt', 'w') as fout:
    fout.writelines(gene + '\n' for gene in sorted(h_pc_genes))

#### Non-coding RNA

In [11]:
# Submit the non-coding RNA query held in `query1`
result = server.query(query1)

# Convert the 's' column with qt.id_from_uri, de-duplicate, and show the count
ncr_gene_ids = qt.query_to_df(result)['s'].apply(qt.id_from_uri)
ncr_genes = set(ncr_gene_ids)
len(ncr_genes)

18066

In [12]:
# Save to a file, one identifier per line.  Sorted so the output is stable
# across runs (a set has no guaranteed iteration order).
with open('data/ncr_genes.txt', 'w') as fout:
    fout.writelines(gene + '\n' for gene in sorted(ncr_genes))

#### Pseudogenes

In [13]:
# Submit the pseudogene query held in `query2`
result = server.query(query2)

# Convert the 's' column with qt.id_from_uri, de-duplicate, and show the count
pseudogene_ids = qt.query_to_df(result)['s'].apply(qt.id_from_uri)
pseu_genes = set(pseudogene_ids)
len(pseu_genes)

16862

In [14]:
# Save to a file, one identifier per line.  Sorted so the output is stable
# across runs (a set has no guaranteed iteration order).
with open('data/pseu_genes.txt', 'w') as fout:
    fout.writelines(gene + '\n' for gene in sorted(pseu_genes))

#### Other Genes

In [15]:
# Submit the remaining-genes query held in `query3`
result = server.query(query3)

# Convert the 's' column with qt.id_from_uri, de-duplicate, and show the count
other_gene_ids = qt.query_to_df(result)['s'].apply(qt.id_from_uri)
other_genes = set(other_gene_ids)
len(other_genes)

4533

In [16]:
# Save to a file, one identifier per line.  Sorted so the output is stable
# across runs (a set has no guaranteed iteration order).
with open('data/other_genes.txt', 'w') as fout:
    fout.writelines(gene + '\n' for gene in sorted(other_genes))

### Proteins

Wikidata treats genes and proteins differently. 

In [17]:
# SPARQL query: distinct items that are a subclass of protein and are found
# in taxon Homo sapiens, together with their English label.  Only ?s and
# ?sLabel are projected because the graph pattern binds no other variable.
query = """
SELECT DISTINCT ?s ?sLabel
WHERE
{
    # Subclass of protein
    ?s wdt:P279 wd:Q8054 .
    # Found in taxon Homo sapiens
    ?s wdt:P703 wd:Q15978631 .

    SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

In [18]:
# Submit the protein query held in `query`
result = server.query(query)

# Convert the 's' column with qt.id_from_uri, de-duplicate, and show the count
protein_ids = qt.query_to_df(result)['s'].apply(qt.id_from_uri)
h_proteins = set(protein_ids)
len(h_proteins)

27566

In [19]:
# Save to a file, one identifier per line.  Sorted so the output is stable
# across runs (a set has no guaranteed iteration order).
with open('data/proteins.txt', 'w') as fout:
    fout.writelines(prot + '\n' for prot in sorted(h_proteins))

### What about chemical compounds?

In [20]:
# SPARQL query: every item that is an instance of chemical compound, together
# with its English label.  Only ?s and ?sLabel are projected because the
# graph pattern binds no other variable.
query = """
SELECT ?s ?sLabel
WHERE
{
  # Instance of Chemical Compound
  ?s wdt:P31 wd:Q11173 .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

In [21]:
# Submit the current `query` to the SPARQL server; the response is kept in
# `result` so that it can be processed in a separate cell.
result = server.query(query)

In [22]:
# Convert the 's' column of the stored result with qt.id_from_uri,
# de-duplicate, and show the count
compound_ids = qt.query_to_df(result)['s'].apply(qt.id_from_uri)
compounds = set(compound_ids)
len(compounds)

156634

In [23]:
# Save to a file, one identifier per line.  Sorted so the output is stable
# across runs (a set has no guaranteed iteration order).
with open('data/compounds.txt', 'w') as fout:
    fout.writelines(comp + '\n' for comp in sorted(compounds))