# Edges from the base nodes of the Biomedical Dataspace of WikiData

In [1]:
from pymantic import sparql
import pandas as pd

# SPARQL endpoint on localhost port 9999; every query in this notebook is sent through `server`.
# NOTE(review): presumably a local Blazegraph instance loaded with a Wikidata dump - confirm.
server = sparql.SPARQLServer('http://127.0.0.1:9999/bigdata/sparql')

In [60]:
import xmltodict
import requests
import json
import os

# prop_dict maps property codes (e.g. 'P31') to their labels.
# Start empty on a first run; otherwise resume from the labels
# saved to disk by an earlier run.
if not os.path.exists('wd_properties.json'):
    prop_dict = dict()
else:
    with open('wd_properties.json', 'r') as fin:
        prop_dict = json.load(fin)
    

def get_prop_label(prop):
    """
    Function to get the property name from a property P-value

    The local cache file 'wd_properties.json' is consulted first. When the
    property is not in the cache (or the cache file does not exist yet), the
    label is taken from the title of the property's Wikidata page.

    prop: String, the wikidata property code (e.g. P31)
    return: String, the label for the given property (e.g. P31 returns 'instance of')
    raises: whatever error `raise_for_status()` raises on the response when
            the property page request fails
    """

    # If there's already a local properties file, check whether the property
    # is in it. The file is read into a local name so the notebook-level
    # prop_dict is not shadowed.
    if os.path.exists('wd_properties.json'):
        with open('wd_properties.json', 'r') as fin:
            cached = json.load(fin)
        if prop in cached:
            return cached[prop]

    # Cache miss (or no cache file): request the info for the property.
    # This must run whenever the property was not found above; attaching it
    # to an `else` of the file-exists check left `title` unbound for
    # properties missing from an existing file.
    r = requests.get('http://www.wikidata.org/prop/direct/' + prop)
    r.raise_for_status()
    # Parse the xml
    rdict = xmltodict.parse(r.text)
    # Get the page title which contains the property name
    title = rdict['html']['head']['title']

    # Titles appear like this: 'Title name - Wikidata'
    # Strip only the trailing ' - <site>' part, so labels that themselves
    # contain a hyphen are kept whole.
    return title.rsplit(' - ', 1)[0]

In [3]:
def id_from_uri(uri):
    """
    Extracts the trailing ID from an entity or property URI

    uri: String, the URI of the entity or property
    return: String, everything after the last '/' (e.g. Q20747295 or P31)
    """
    _, _, tail = uri.rpartition('/')
    return tail

In [4]:
def query_to_df(result):
    """
    Tabulates the bindings of a sparql query result as a Pandas DataFrame

    result: json, result from sparql query
    return: DataFrame, one row per binding and one column per query
            variable, holding each variable's 'value' field
    """
    rows = [
        {var: cell['value'] for var, cell in binding.items()}
        for binding in result['results']['bindings']
    ]
    return pd.DataFrame(rows)

## Diseases

In [28]:
# SPARQL for the disease edges. It selects every item that is an instance of
# (wdt:P31) wd:Q12136, together with each outgoing edge ?p and its target ?o.
# A target is kept only when no rdf:type triple exists for it and it has at
# least one outgoing triple of its own whose object also has no rdf:type
# triple. English labels are filled in by the wikibase:label service.
query = """
SELECT distinct ?disease ?diseaseLabel ?p ?o ?oLabel
WHERE
{
    # disease is instace of disease
    ?disease wdt:P31 wd:Q12136 .
    # Get edges and targets
    ?disease ?p ?o .
    FILTER NOT EXISTS {?o rdf:type ?type .}
    # Make sure edges ave there own edges and targes
    ?o ?p2 ?o2 .
    FILTER NOT EXISTS {?o2 rdf:type ?type .}
    SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}"""

In [38]:
%time result = server.query(query)

CPU times: user 292 ms, sys: 1.48 s, total: 1.78 s
Wall time: 1min 43s


In [None]:
disease_df = query_to_df(result)

Get Human Readable labels for the Properties

In [40]:
# Distinct property IDs (e.g. 'P31') that occur as edges
props = {id_from_uri(uri) for uri in disease_df['p']}
# Resolve every ID to its label first, then merge them into prop_dict
prop_dict.update({code: get_prop_label(code) for code in props})

# Attach the readable property label to each edge row
disease_df['pLabel'] = disease_df['p'].map(lambda uri: prop_dict[id_from_uri(uri)])

In [41]:
len(disease_df)

29606

In [110]:
test = disease_df[disease_df.duplicated(subset=['diseaseLabel', 'pLabel', 'oLabel'], keep=False)]


In [111]:
test.loc[1329]


disease         http://www.wikidata.org/entity/Q18554674
diseaseLabel                            cerebral malaria
o                  http://www.wikidata.org/entity/Q12156
oLabel                                           malaria
p               http://www.wikidata.org/prop/direct/P279
pLabel                                       subclass of
Name: 1329, dtype: object

In [112]:
test.loc[1361]

disease          http://www.wikidata.org/entity/Q2822915
diseaseLabel                            cerebral malaria
o                  http://www.wikidata.org/entity/Q12156
oLabel                                           malaria
p               http://www.wikidata.org/prop/direct/P279
pLabel                                       subclass of
Name: 1361, dtype: object

In [42]:
disease_df[['diseaseLabel', 'pLabel', 'oLabel']].head(20)

Unnamed: 0,diseaseLabel,pLabel,oLabel
0,chronic cystitis,subclass of,cystitis
1,Alzheimer's disease 2,subclass of,Alzheimer's disease
2,Alzheimer's disease 4,subclass of,Alzheimer's disease
3,Alzheimer's disease 8,subclass of,Alzheimer's disease
4,Alzheimer's disease 7,subclass of,Alzheimer's disease
5,Alzheimer's disease 19,subclass of,Alzheimer's disease
6,hemorrhagic cystitis,subclass of,cystitis
7,Clay-shoveler fracture,subclass of,spinal fracture
8,acute cystitis,subclass of,cystitis
9,Alzheimer's disease 6,subclass of,Alzheimer's disease


In [43]:
disease_df['pLabel'].value_counts()[:20]

subclass of                11169
instance of                 8818
drug used for treatment     2923
medical specialty           2211
genetic association         2182
has cause                    398
topic's main category        359
symptoms                     303
described by source          177
afflicts                     176
named after                  166
anatomical location          158
medical examinations          68
cause of                      58
discoverer or inventor        51
equivalent class              48
Disease Ontology ID           39
said to be the same as        32
IC                            31
different from                29
Name: pLabel, dtype: int64

In [69]:
# Save to disk
# `key` is passed by keyword: newer pandas releases deprecate positional
# arguments to to_hdf other than the path.
# NOTE(review): the key is spelled 'disase_df' (sic). It is kept unchanged so
# anything that already reads this file by key continues to work.
disease_df.to_hdf('data/disease_edges.h5', key='disase_df')

## Genes

There are too many genes for a single query; attempting one results in a `MemoryManagerOutOfMemory` error.  
Therefore, we will loop through the list of genes, running a separate query for each.

In [51]:
# Per-gene SPARQL template, used with str.format: the doubled braces {{ }} are
# literal braces in the final query, and the single {} is filled with one
# entity ID. It returns the gene's outgoing edges ?p and targets ?o, keeping a
# target only when no rdf:type triple exists for it and it has at least one
# outgoing triple whose object also has no rdf:type triple.
query = """
SELECT distinct ?gene ?geneLabel ?p ?o ?oLabel
WHERE
{{
    values ?gene {{wd:{}}}
    # Get edges and targets
    ?gene ?p ?o .
    FILTER NOT EXISTS {{?o rdf:type ?type .}}
    # Make sure edges ave there own edges and targes
    ?o ?p2 ?o2 .
    FILTER NOT EXISTS {{?o2 rdf:type ?type .}}
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
}}"""

In [48]:
def open_datafile(filename):
    """
    Reads a file from the 'data/' folder into a list of its lines

    filename: String, the name of the file inside 'data/'
    return: List, one entry per line with surrounding whitespace stripped
    """
    with open('data/'+filename, 'r') as fin:
        return [line.strip() for line in fin]

In [49]:
h_genes = open_datafile('h_genes.txt')

In [53]:
from tqdm import tqdm

Query each gene to get edges and nodes they point to

In [54]:
# One request per gene ID; every response is kept in memory in `results`.
# NOTE(review): there is no error handling - a failed request aborts the loop,
# leaving the responses gathered so far in `results`.
results = []
for gene in tqdm(h_genes):
    results.append(server.query(query.format(gene)))

100%|██████████| 60162/60162 [24:18<00:00, 29.91it/s]


Convert the results to DataFrames and concatenate them

In [55]:
# Convert each per-gene response into its own DataFrame.
# Note: the loop rebinds the notebook-level name `result`.
results_dfs = []
for result in tqdm(results):
    results_dfs.append(query_to_df(result))

100%|██████████| 60162/60162 [00:54<00:00, 1110.15it/s]


In [65]:
# Stack the per-gene frames into one table with a fresh 0..n-1 index
gene_df = pd.concat(results_dfs, ignore_index=True)
len(gene_df)

231966

In [61]:
# Distinct property IDs (e.g. 'P31') that occur as edges
props = {id_from_uri(uri) for uri in gene_df['p']}
# Resolve every ID to its label first, then merge them into prop_dict
prop_dict.update({code: get_prop_label(code) for code in props})

# Attach the readable property label to each edge row
gene_df['pLabel'] = gene_df['p'].map(lambda uri: prop_dict[id_from_uri(uri)])

In [66]:
gene_df[['geneLabel', 'pLabel', 'oLabel']].head(20)

Unnamed: 0,geneLabel,pLabel,oLabel
0,DYZ1L27,subclass of,gene
1,MAPK12,chromosome,Homo sapiens chromosome 22
2,MAPK12,ortholog,Mapk12
3,MAPK12,subclass of,protein-coding gene
4,MAPK12,encodes,Mitogen-activated protein kinase 12
5,MAPK12,ortholog,Mapk12
6,MAPK12,strand orientation,Reverse Strand
7,MAPK12,subclass of,gene
8,OOEP-AS1,subclass of,gene
9,OOEP-AS1,subclass of,non-coding RNA


In [67]:
gene_df['pLabel'].value_counts()

subclass of                                        116880
ortholog                                            35295
encodes                                             27194
chromosome                                          25673
strand orientation                                  24631
genetic association                                  2205
instance of                                            48
has part                                                6
described by source                                     5
decreased expression in                                 4
different from                                          3
medical condition treated                               3
deletion association with                               2
RefSeq RNA ID                                           2
topic's main category                                   2
encoded by                                              1
cause of                                                1
increased expr

In [68]:
# Save to disk
# `key` is passed by keyword: newer pandas releases deprecate positional
# arguments to to_hdf other than the path.
gene_df.to_hdf('data/h_genes_edges.h5', key='gene_df')

## Proteins

Genes did not seem to have very many interesting edges... let's see if proteins have anything better

In [116]:
# Per-protein SPARQL template, used with str.format: the doubled braces {{ }}
# are literal braces in the final query, and the single {} is filled with one
# entity ID. It returns the protein's outgoing edges ?p and targets ?o,
# keeping a target only when no rdf:type triple exists for it and it has at
# least one outgoing triple whose object also has no rdf:type triple.
query = """
SELECT distinct ?prot ?protLabel ?p ?o ?oLabel
WHERE
{{
    values ?prot {{wd:{}}}
    # Get edges and targets
    ?prot ?p ?o .
    FILTER NOT EXISTS {{?o rdf:type ?type .}}
    # Make sure edges ave there own edges and targes
    ?o ?p2 ?o2 .
    FILTER NOT EXISTS {{?o2 rdf:type ?type .}}
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
}}"""

In [117]:
h_prots = open_datafile('h_proteins.txt')

In [120]:
# One request per protein ID; every response is kept in memory in `results`.
# NOTE(review): there is no error handling - a failed request aborts the loop,
# leaving the responses gathered so far in `results`.
results = []
for prot in tqdm(h_prots):
    results.append(server.query(query.format(prot)))

100%|██████████| 26708/26708 [27:28<00:00, 16.20it/s]


In [121]:
# Convert each per-protein response into its own DataFrame.
# Note: the loop rebinds the notebook-level name `result`.
results_dfs = []
for result in tqdm(results):
    results_dfs.append(query_to_df(result))

100%|██████████| 26708/26708 [00:24<00:00, 1081.30it/s]


In [122]:
# Stack the per-protein frames into one table with a fresh 0..n-1 index
prot_df = pd.concat(results_dfs, ignore_index=True)
len(prot_df)

468107

In [124]:
# Distinct property IDs (e.g. 'P31') that occur as edges
props = {id_from_uri(uri) for uri in prot_df['p']}
# Resolve every ID to its label first, then merge them into prop_dict
prop_dict.update({code: get_prop_label(code) for code in props})

# Attach the readable property label to each edge row
prot_df['pLabel'] = prot_df['p'].map(lambda uri: prop_dict[id_from_uri(uri)])

In [125]:
prot_df[['protLabel', 'pLabel', 'oLabel']].head(20)

Unnamed: 0,protLabel,pLabel,oLabel
0,Putative uncharacterized protein ATP1A1-AS1,subclass of,protein
1,Putative uncharacterized protein ATP1A1-AS1,encoded by,ATP1A1-AS1
2,C2 calcium dependent domain containing 2,cell component,extracellular region
3,C2 calcium dependent domain containing 2,encoded by,C2CD2
4,C2 calcium dependent domain containing 2,has part,C2 domain
5,C2 calcium dependent domain containing 2,subclass of,protein
6,C2 calcium dependent domain containing 2,cell component,nucleus
7,C2 calcium dependent domain containing 2,cell component,cytosol
8,Protein phosphatase with EF-hand domain 2,molecular function,protein serine/threonine phosphatase activity
9,Protein phosphatase with EF-hand domain 2,has part,EF-hand domain pair


In [126]:
prot_df['pLabel'].value_counts()[:20]

biological process           153068
cell component                96791
molecular function            81450
has part                      61545
subclass of                   39483
encoded by                    27007
physically interacts with      3696
instance of                    1705
chromosome                     1666
strand orientation             1666
medical condition treated         4
topic's main category             3
described by source               3
ortholog                          3
part of                           3
different from                    2
genetic association               2
material used                     2
encodes                           2
material produced                 2
Name: pLabel, dtype: int64

In [127]:
# Save to disk
# `key` is passed by keyword: newer pandas releases deprecate positional
# arguments to to_hdf other than the path.
prot_df.to_hdf('data/h_protein_edges.h5', key='prot_df')

## Compounds

In [70]:
# Per-compound SPARQL template, used with str.format: the doubled braces {{ }}
# are literal braces in the final query, and the single {} is filled with one
# entity ID. It returns the compound's outgoing edges ?p and targets ?o,
# keeping a target only when no rdf:type triple exists for it and it has at
# least one outgoing triple whose object also has no rdf:type triple.
query = """
SELECT distinct ?comp ?compLabel ?p ?o ?oLabel
WHERE
{{
    values ?comp {{wd:{}}}
    # Get edges and targets
    ?comp ?p ?o .
    FILTER NOT EXISTS {{?o rdf:type ?type .}}
    # Make sure edges have their own edges and targes
    ?o ?p2 ?o2 .
    FILTER NOT EXISTS {{?o2 rdf:type ?type .}}
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
}}"""

In [71]:
compounds = open_datafile('compounds.txt')

In [72]:
# One request per compound ID; every response is kept in memory in `results`.
# NOTE(review): there is no error handling - a failed request aborts the loop,
# leaving the responses gathered so far in `results`.
results = []
for comp in tqdm(compounds):
    results.append(server.query(query.format(comp)))

100%|██████████| 156683/156683 [52:00<00:00, 50.22it/s] 


In [73]:
# Convert each per-compound response into its own DataFrame.
# Note: the loop rebinds the notebook-level name `result`.
results_dfs = []
for result in tqdm(results):
    results_dfs.append(query_to_df(result))

100%|██████████| 156683/156683 [02:17<00:00, 1142.58it/s]


In [74]:
# Stack the per-compound frames into one table with a fresh 0..n-1 index
comp_df = pd.concat(results_dfs, ignore_index=True)
len(comp_df)

189127

In [75]:
# Distinct property IDs (e.g. 'P31') that occur as edges
props = {id_from_uri(uri) for uri in comp_df['p']}
# Resolve every ID to its label first, then merge them into prop_dict
prop_dict.update({code: get_prop_label(code) for code in props})

# Attach the readable property label to each edge row
comp_df['pLabel'] = comp_df['p'].map(lambda uri: prop_dict[id_from_uri(uri)])

In [76]:
comp_df[['compLabel', 'pLabel', 'oLabel']].head(20)

Unnamed: 0,compLabel,pLabel,oLabel
0,N-[4-[4-(ethylsulfonylamino)-3-methoxyphenyl]-...,instance of,chemical compound
1,daledalin tosylate,instance of,chemical compound
2,Enestebol,instance of,chemical compound
3,"4-(dimethylamino)-N-[(3R,9R,10S)-12-[(2R)-1-hy...",instance of,chemical compound
4,"2-pyridin-2-yl-7,8-dihydro-5H-thiopyrano[4,3-D...",instance of,chemical compound
5,benzyl methyl disulfide,instance of,chemical compound
6,"9-(3-hydroxy-2,6,6-trimethyl-1-cyclohexenyl)-3...",instance of,chemical compound
7,"N-(3-methoxypropyl)-3,4,5-trimethoxybenzylamine",instance of,chemical compound
8,encenicline,physically interacts with,Cholinergic receptor nicotinic alpha 7 subunit
9,encenicline,instance of,chemical compound


In [82]:
comp_df['pLabel'].value_counts()[:20]

instance of                     162254
has part                         13928
physically interacts with         3909
medical condition treated         3557
significant drug interaction      1799
subclass of                        963
cause of                           613
topic's main category              412
described by source                359
crystal system                     304
color                              293
route of administration            197
discoverer or inventor              65
named after                         58
different from                      40
biological process                  40
part of                             38
use                                 36
material used                       36
decays to                           31
Name: pLabel, dtype: int64

In [78]:
# Save to disk
# `key` is passed by keyword: newer pandas releases deprecate positional
# arguments to to_hdf other than the path.
# NOTE(review): the file name is spelled 'compund_edges.h5' (sic). It is kept
# unchanged so anything that already reads this path continues to work.
comp_df.to_hdf('data/compund_edges.h5', key='comp_df')

## Summary so far

### Nodes

- Compound Nodes: 156,683
- Gene Nodes (Homo sapiens):
    - Protein Coding: 20,706
    - Any: 60,162
    - All genes & taxons: 617,694
- Disease Nodes: 8,779

### Edges

- From Compounds: 189,022
- From Genes (Any Human): 231,966
- From Diseases: 162,024


## Save properties for future use

In [79]:
# Load the labels already saved on disk. On a first run the file does not
# exist yet, so start from an empty dict instead of failing on open().
if os.path.exists('wd_properties.json'):
    with open('wd_properties.json', 'r') as fin:
        prop_dict1 = json.load(fin)
else:
    prop_dict1 = {}
# Compare what new properties we've looked up to those already in the file
len(prop_dict), len(prop_dict1)

(694, 694)

In [80]:
# See if there's new ones that can be added
prop_dict1.update(prop_dict)
len(prop_dict1)

694

In [81]:
# Save with new properties to file
with open('wd_properties.json', 'w') as fout:
    json.dump(prop_dict1, fout, indent=2)

## Save the entity labels for future use

Supposedly the line `SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}` in queries is expensive,
so we'll save the labels so they can be mapped later.

In [146]:
# entity_dict maps entity IDs to labels. Start empty on a first run;
# otherwise resume from the labels saved by an earlier run.
if not os.path.exists('wd_entities.json'):
    entity_dict = {}
else:
    with open('wd_entities.json', 'r') as fin:
        entity_dict = json.load(fin)

In [147]:
def add_to_entity_dict(entity_dict, df, eid, label):
    """
    Adds the entity:label pairs found in a wikidata query result to the entity dict

    entity_dict: dict, updated in place; IDs already present are overwritten
    df: DataFrame, query results
    eid: String, name of the column holding the entity URIs
    label: String, name of the column holding the matching labels
    return: None
    """
    records = df[[eid, label]].to_dict(orient='records')
    # Build all pairs first so entity_dict is only touched once they all resolve
    fresh = {}
    for row in records:
        fresh[id_from_uri(row[eid])] = row[label]
    entity_dict.update(fresh)

In [148]:
# Each frame contributes two sets of labels: its subject entities and the
# targets (?o) of their edges. Later calls overwrite the label of any ID
# already added by an earlier call, so the call order decides which label wins.
# Add diseases
add_to_entity_dict(entity_dict, disease_df, 'disease', 'diseaseLabel')
add_to_entity_dict(entity_dict, disease_df, 'o', 'oLabel')
# Add Genes
add_to_entity_dict(entity_dict, gene_df, 'gene', 'geneLabel')
add_to_entity_dict(entity_dict, gene_df, 'o', 'oLabel')
# Add Proteins
add_to_entity_dict(entity_dict, prot_df, 'prot', 'protLabel')
add_to_entity_dict(entity_dict, prot_df, 'o', 'oLabel')
# Add Compounds
add_to_entity_dict(entity_dict, comp_df, 'comp', 'compLabel')
add_to_entity_dict(entity_dict, comp_df, 'o', 'oLabel')

In [149]:
len(entity_dict)

323441

In [150]:
with open('wd_entities.json', 'w') as fout:
    json.dump(entity_dict, fout, indent=2)