# Testing pymantic: counting the biomedical nodes in Wikidata

In [1]:
from pymantic import sparql
import pandas as pd
import sys
sys.path.append('../py')
import sparql_tools as qt

In [2]:
# SPARQL endpoint served from localhost, port 9999.
SPARQL_ENDPOINT = 'http://127.0.0.1:9999/bigdata/sparql'
server = sparql.SPARQLServer(SPARQL_ENDPOINT)

## Counting Nodes

### Diseases

In [3]:
# SPARQL: every item ?s that is an instance of (P31) disease (Q12136); the
# label service is asked for English labels (?sLabel).
# Note: ?p and ?o are projected but never bound in the WHERE clause.
query = """
SELECT ?s ?sLabel ?p ?o
WHERE
{
  # Instance of Diseae
  ?s wdt:P31 wd:Q12136 .
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

In [4]:
# Send the disease query to the SPARQL server.
# `query` must still hold the disease query defined in the previous cell.
result = server.query(query)

# Reduce the subject column to a set of distinct IDs
# (qt.id_from_uri presumably strips the URI prefix -- confirm in sparql_tools).
disease_uris = qt.query_to_df(result)['s']
diseases = set(disease_uris.apply(qt.id_from_uri))

# Number of distinct diseases (displayed as the cell output)
len(diseases)

8785

#### Save diseases to a file

In [5]:
# Persist the disease IDs, one per line.
# The IDs are sorted before writing: the iteration order of a set of strings
# differs between interpreter runs (string hashing is randomized), so writing
# the set directly would reorder the file on every re-run of the notebook.
with open('data/diseases.txt', 'w') as fout:
    fout.writelines(disease + '\n' for disease in sorted(diseases))

### How many Genes are there?

There are multiple ways to count genes:

- Protein coding or all
- Just human vs all

We will look at:

- Human protein-coding genes
- All human genes
- All genes across all taxa

In [6]:
# Three gene queries of decreasing specificity. Each selects items ?s through
# "subclass of" (P279) and asks the label service for English labels.
# In all three, ?p and ?o are projected but never bound in the WHERE clause.

# Human Protein Coding Genes:
# subclass of protein-coding gene (Q20747295), found in taxon (P703) Homo sapiens (Q15978631)
query = """
SELECT ?s ?sLabel ?p ?o
WHERE
{
  # Subclass of Protein-Coding Gene
  ?s wdt:P279 wd:Q20747295 .
  # Found in Taxon Homo Sapiens
  ?s wdt:P703 wd:Q15978631 .
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

# Any Human Genes, Protein-coding or not:
# subclass of gene (Q7187), found in taxon (P703) Homo sapiens (Q15978631)
query1 = """
SELECT ?s ?sLabel ?p ?o
WHERE
{
  # Subclass of Gene
  ?s wdt:P279 wd:Q7187 .
  # Found in taxon homo sapiens
  ?s wdt:P703 wd:Q15978631 .
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

# All Genes, all Taxons:
# subclass of gene (Q7187), with no restriction on the taxon
query2 = """
SELECT ?s ?sLabel ?p ?o
WHERE
{
  # Subclass of Gene
  ?s wdt:P279 wd:Q7187 .
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

Human Protein Coding Genes

In [7]:
# Run the human protein-coding gene query.
# `query` must still hold that query string (it is reassigned elsewhere in the notebook).
result = server.query(query)

# Distinct gene IDs taken from the subject column, then their count
h_pc_gene_uris = qt.query_to_df(result)['s']
h_pc_genes = set(h_pc_gene_uris.apply(qt.id_from_uri))
len(h_pc_genes)

20705

In [8]:
# Save the human protein-coding gene IDs, one per line.
# Sorted so the file is identical from run to run; iterating the set directly
# yields a different order in each interpreter session.
with open('data/h_pc_genes.txt', 'w') as fout:
    fout.writelines(gene + '\n' for gene in sorted(h_pc_genes))

All Human Genes

In [9]:
# Run the query for all human genes (protein-coding or not), held in `query1`
result = server.query(query1)

# Distinct gene IDs taken from the subject column, then their count
h_gene_uris = qt.query_to_df(result)['s']
h_genes = set(h_gene_uris.apply(qt.id_from_uri))
len(h_genes)

60161

In [10]:
# Save the IDs of all human genes, one per line.
# Sorted so the file is identical from run to run; iterating the set directly
# yields a different order in each interpreter session.
with open('data/h_genes.txt', 'w') as fout:
    fout.writelines(gene + '\n' for gene in sorted(h_genes))

All Genes, All Taxons!

In [11]:
# Run the query for genes of every taxon, held in `query2`
result = server.query(query2)

# Distinct gene IDs taken from the subject column, then their count
all_gene_uris = qt.query_to_df(result)['s']
all_genes = set(all_gene_uris.apply(qt.id_from_uri))
len(all_genes)

617693

In [12]:
# Save the gene IDs for all taxa, one per line.
# Sorted so the file is identical from run to run; iterating the set directly
# yields a different order in each interpreter session.
with open('data/all_genes.txt', 'w') as fout:
    fout.writelines(gene + '\n' for gene in sorted(all_genes))

### Proteins

Wikidata treats genes and proteins as different items, so it may be beneficial to bring in proteins as base nodes as well.

In [3]:
# SPARQL: distinct items ?s that are a subclass of (P279) protein (Q8054) and
# found in taxon (P703) Homo sapiens (Q15978631), with English labels.
# Note: ?p, ?o and ?oLabel are projected but ?p and ?o are never bound in the
# WHERE clause.
# This reassigns `query`, replacing whatever query string it held before.
query = """
SELECT distinct ?s ?sLabel ?p ?o ?oLabel
WHERE
{
    #subclass of protein
    ?s wdt:P279 wd:Q8054 .
    #found in taxon homo sapiens
    ?s wdt:P703 wd:Q15978631 .

    SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

In [4]:
# Run the human protein query.
# `query` must hold the protein query at this point (it is reassigned elsewhere in the notebook).
result = server.query(query)

# Distinct protein IDs taken from the subject column, then their count
h_protein_uris = qt.query_to_df(result)['s']
h_proteins = set(h_protein_uris.apply(qt.id_from_uri))
len(h_proteins)

27567

In [15]:
# Save the human protein IDs, one per line.
# Sorted so the file is identical from run to run; iterating the set directly
# yields a different order in each interpreter session.
with open('data/h_proteins.txt', 'w') as fout:
    fout.writelines(prot + '\n' for prot in sorted(h_proteins))

### What about chemical compounds?

In [16]:
# SPARQL: every item ?s that is an instance of (P31) chemical compound
# (Q11173), with English labels requested from the label service.
# Note: ?p and ?o are projected but never bound in the WHERE clause.
# This reassigns `query`, replacing whatever query string it held before.
query = """
SELECT ?s ?sLabel ?p ?o
WHERE
{
  # insance of Chemical Compound
  ?s wdt:P31 wd:Q11173 .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

In [17]:
# Run the chemical-compound query; `query` must hold that query string here.
# The response is kept in `result` for the counting step that follows.
result = server.query(query)

In [18]:
# Distinct compound IDs taken from the subject column of `result`, then their count
compound_uris = qt.query_to_df(result)['s']
compounds = set(compound_uris.apply(qt.id_from_uri))
len(compounds)

156692

In [19]:
# Save the chemical compound IDs, one per line.
# Sorted so the file is identical from run to run; iterating the set directly
# yields a different order in each interpreter session.
with open('data/compounds.txt', 'w') as fout:
    fout.writelines(comp + '\n' for comp in sorted(compounds))

## Save the base nodes for future queries

We'll use all human genes (protein-coding or not) as the biomedical base nodes for now, along with human proteins, compounds, and diseases.

In [20]:
# Write all base node IDs to a single file, one per line, in group order:
# human genes, human proteins, diseases, compounds.
#
# Two guarantees the plain nested loops did not give:
#   * an ID that occurs in more than one set is written only once, in the
#     first group that contains it, so the file has no duplicate lines;
#   * each group is sorted, so the file is identical from run to run
#     (set iteration order changes between interpreter sessions).
already_written = set()
with open('data/basenodes.txt', 'w') as fout:
    for node_group in (h_genes, h_proteins, diseases, compounds):
        new_ids = sorted(node_group - already_written)
        fout.writelines(node_id + '\n' for node_id in new_ids)
        already_written.update(new_ids)