# Testing pymantic by counting the biomedical nodes in Wikidata

In [1]:
from pymantic import sparql
import pandas as pd

In [2]:
# Client for the SPARQL endpoint on localhost port 9999; every query in this
# notebook is sent through this object.
server = sparql.SPARQLServer('http://127.0.0.1:9999/bigdata/sparql')

In [3]:
def id_from_uri(uri):
    """
    Extract the trailing identifier from an entity or property URI.

    uri: String, the URI of the entity or property
    return: String, everything after the last '/' (e.g. Q20747295 or P31);
            the whole string is returned when it contains no '/'
    """
    _, _, identifier = uri.rpartition('/')
    return identifier

In [4]:
def query_to_df(result):
    """
    Flatten the JSON bindings of a SPARQL result into a Pandas DataFrame.

    result: json, result from sparql query
    return: DataFrame, one row per binding and one column per variable,
            holding the 'value' field of each bound variable
    """
    rows = [
        {var: cell['value'] for var, cell in binding.items()}
        for binding in result['results']['bindings']
    ]
    return pd.DataFrame(rows)

## Counting Nodes

### Diseases

In [5]:
# SPARQL query for every item ?s that is an instance of (P31) disease
# (Q12136), with English labels requested from the label service.
# ?p and ?o are selected but never bound in the WHERE clause.
query = """
SELECT ?s ?sLabel ?p ?o
WHERE
{
  # Instance of Diseae
  ?s wdt:P31 wd:Q12136 .
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

In [6]:
# Run the disease query against the SPARQL server
result = server.query(query)

# Reduce the subject URIs to their unique IDs; the cell shows how many there are
diseases = {id_from_uri(uri) for uri in query_to_df(result)['s']}
len(diseases)

8779

#### Save diseases to a file

In [7]:
# One disease ID per line
with open('data/diseases.txt', 'w') as fout:
    fout.writelines(disease + '\n' for disease in diseases)

### How many Genes are there?

There are multiple ways to count genes:

- Protein coding or all
- Just human vs all

We will look at:
- Human Protein Coding
- Human All
- All Taxons all Genes

In [8]:
# Human Protein Coding Genes
# Selects every ?s that is a subclass of (P279) protein-coding gene (Q20747295)
# and is found in taxon (P703) Homo sapiens (Q15978631), with English labels.
# ?p and ?o are selected but never bound in the WHERE clause.
query = """
SELECT ?s ?sLabel ?p ?o
WHERE
{
  # Subclass of Protein-Coding Gene
  ?s wdt:P279 wd:Q20747295 .
  # Found in Taxon Homo Sapiens
  ?s wdt:P703 wd:Q15978631 .
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

# Any Human Genes, Protein-coding or not
# Selects every ?s that is a subclass of (P279) gene (Q7187) and is found in
# taxon (P703) Homo sapiens (Q15978631), with English labels.
# ?p and ?o are selected but never bound in the WHERE clause.
query1 = """
SELECT ?s ?sLabel ?p ?o
WHERE
{
  # Subclass of Gene
  ?s wdt:P279 wd:Q7187 .
  # Found in taxon homo sapiens
  ?s wdt:P703 wd:Q15978631 .
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

# All Genes, all Taxons
# Selects every ?s that is a subclass of (P279) gene (Q7187), with no taxon
# restriction, with English labels.
# ?p and ?o are selected but never bound in the WHERE clause.
query2 = """
SELECT ?s ?sLabel ?p ?o
WHERE
{
  # Subclass of Gene
  ?s wdt:P279 wd:Q7187 .
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

Human Protein Coding Genes

In [9]:
# Run the human protein-coding gene query
result = server.query(query)

# Unique gene IDs taken from the subject URIs; the cell shows the count
h_pc_genes = {id_from_uri(uri) for uri in query_to_df(result)['s']}
len(h_pc_genes)

20706

In [10]:
# Write one gene ID per line
with open('data/h_pc_genes.txt', 'w') as fout:
    fout.writelines(gene + '\n' for gene in h_pc_genes)

All Human Genes

In [11]:
# Run the all-human-genes query
result = server.query(query1)

# Unique gene IDs taken from the subject URIs; the cell shows the count
h_genes = {id_from_uri(uri) for uri in query_to_df(result)['s']}
len(h_genes)

60162

In [12]:
# Write one gene ID per line
with open('data/h_genes.txt', 'w') as fout:
    fout.writelines(gene + '\n' for gene in h_genes)

All Genes, All Taxons!

In [13]:
# Run the all-genes, all-taxons query
result = server.query(query2)

# Unique gene IDs taken from the subject URIs; the cell shows the count
all_genes = {id_from_uri(uri) for uri in query_to_df(result)['s']}
len(all_genes)

617694

In [14]:
# Write one gene ID per line
with open('data/all_genes.txt', 'w') as fout:
    fout.writelines(gene + '\n' for gene in all_genes)

### Proteins

Wikidata treats genes and proteins differently. It may be a good idea to get a feel for the number of human proteins as well.

In [21]:
# Selects distinct ?s linked through P702 to an ?o that is a subclass of
# (P279) gene (Q7187) and is found in taxon (P703) Homo sapiens (Q15978631).
# NOTE(review): P702 is presumably Wikidata's "encoded by" property, making
# ?s the protein and ?o its human gene -- confirm against Wikidata.
# ?p is selected but never bound in the WHERE clause.
query = """
SELECT distinct ?s ?sLabel ?p ?o ?oLabel
WHERE
{
    ?s wdt:P702 ?o . 
  	?o wdt:P279 wd:Q7187 .
  	?o wdt:P703 wd:Q15978631 .

    SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

In [22]:
# Run the protein query
result = server.query(query)

# Unique IDs taken from the subject URIs; the cell shows the count
h_proteins = {id_from_uri(uri) for uri in query_to_df(result)['s']}
len(h_proteins)

26708

In [23]:
# Write one protein ID per line
with open('data/h_proteins.txt', 'w') as fout:
    fout.writelines(prot + '\n' for prot in h_proteins)

### What about chemical compounds?

In [15]:
# SPARQL query for every item ?s that is an instance of (P31) chemical
# compound (Q11173), with English labels requested from the label service.
# ?p and ?o are selected but never bound in the WHERE clause.
query = """
SELECT ?s ?sLabel ?p ?o
WHERE
{
  # insance of Chemical Compound
  ?s wdt:P31 wd:Q11173 .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

In [16]:
# Run the chemical compound query; the result is kept in its own cell so the
# following cells can reuse it without querying again.
result = server.query(query)

In [17]:
# Unique compound IDs taken from the subject URIs; the cell shows the count
compounds = {id_from_uri(uri) for uri in query_to_df(result)['s']}
len(compounds)

156683

In [18]:
# Write one compound ID per line
with open('data/compounds.txt', 'w') as fout:
    fout.writelines(comp + '\n' for comp in compounds)