# Get The N1 nodes for the network

In [1]:
import sys
sys.path.append('../py')
import sparql_tools as qt
import pandas as pd
from pymantic import sparql

# Handle to the SPARQL endpoint on localhost:9999; query_node_type sends its
# queries through server.query.
# NOTE(review): the /bigdata/ path looks like a Blazegraph instance - confirm.
server = sparql.SPARQLServer('http://127.0.0.1:9999/bigdata/sparql')

In [2]:
def query_node_type(query, n_type, filename):
    """
    Run a SPARQL query for one node type and save the resulting ids to a file.

    The query is sent to the module-level ``server``. The ``s`` column of the
    result is converted to ids with ``qt.id_from_uri`` and de-duplicated.
    The ids are written one per line, in sorted order, so that re-running the
    same query against the same data yields an identical file. Finally the
    pair ``n_type -> filename`` is recorded in the module-level ``node_files``
    dict.

    Queries are unique to every node type and must be written by hand.

    :param query: String, the SPARQL query to be run; its result must expose
        an ``s`` column
    :param n_type: String, the name for the type of nodes being queried for
    :param filename: String, the name of the file to write the node ids to
    """
    # Query for the ids of a node type
    result = server.query(query)
    qids = set(qt.query_to_df(result)['s'].apply(qt.id_from_uri))

    # Write ids to a file. Sorting makes the output independent of set
    # iteration order, which can change from run to run for strings.
    with open(filename, 'w') as fout:
        fout.writelines(qid + '\n' for qid in sorted(qids))

    # Print some results to screen
    print('Wrote {} ids of type {} to file {}'.format(len(qids), n_type, filename))

    # Record which file holds this node type. node_files is only mutated,
    # never rebound, so no ``global`` declaration is required.
    node_files[n_type] = filename

In [3]:
# `types` holds the node types that still need a query. Each section below
# removes its own type from this list, which raises on a typo in a type name
# and showed which types were still outstanding while this notebook was
# first being written.
with open('data/remain_types.txt') as fin:
    remaining = fin.read()
types = remaining.strip().split('\n')
types

['Supersecondary Structure',
 'Structural Motif',
 'Active Site',
 'Binding Site',
 'Post-translational Modification',
 'Chemical Hazard',
 'Symptom',
 'GO Term',
 'Protein Family',
 'Protein Domain',
 'Mature MicroRNA']

In [4]:
import json
# Load the existing node-type -> id-file mapping; query_node_type adds one
# entry to this dict for every node type it processes.
with open('node_info.json') as fin:
    node_files = json.loads(fin.read())
node_files

{'Compound': 'data/compounds.txt',
 'Disease': 'data/disease.txt',
 'Gene': 'data/other_genes.txt',
 'Non-coding RNA': 'data/ncr_genes.txt',
 'Protein': 'data/proteins.txt',
 'Protein-coding Gene': 'data/pc_genes.txt',
 'Pseudogene': 'data/pseu_genes.txt'}

## Query for GO Terms

In [5]:
# SPARQL query for GO terms: collects every distinct ?s that is the object of
# a P680, P682 or P681 statement on any subject ?a (the three GO branches
# named in the in-query comments).
# NOTE(review): ?sLabel is selected but nothing in the WHERE clause binds it,
# and query_node_type only reads the 's' column - confirm it is needed.
query = """
SELECT DISTINCT ?s ?sLabel
WHERE
{
  # Molecular Function Terms
  {?a wdt:P680 ?s}
  # Biolocial Process Terms
  UNION {?a wdt:P682 ?s}
  # Cell Component Terms
  UNION {?a wdt:P681 ?s}
}
"""

In [6]:
# Check 'GO Term' off the list of remaining types (ValueError if the name is
# not in the list), then run the query above and save the ids.
ntype = 'GO Term'
types.remove(ntype)
filename = 'data/go_terms.txt'
query_node_type(query, ntype, filename)

Wrote 21358 ids of type GO Term to file data/go_terms.txt


## Mature MicroRNA

In [7]:
# SPARQL query selecting every distinct item ?s that has a P31 statement
# pointing at Q23838648 (mature microRNA, per the in-query comment).
query = """
SELECT DISTINCT ?s ?sLabel
WHERE
{
  # Instance of Mature MicroRNA
  ?s wdt:P31 wd:Q23838648  
}
"""

In [8]:
# Check 'Mature MicroRNA' off the list of remaining types (ValueError if the
# name is not in the list), then run the query above and save the ids.
ntype = 'Mature MicroRNA'
types.remove(ntype)
filename = 'data/mirna.txt'
query_node_type(query, ntype, filename)

Wrote 2588 ids of type Mature MicroRNA to file data/mirna.txt


## Supersecondary Structure

In [9]:
# SPARQL query selecting every distinct item ?s that has a P31 statement
# pointing at Q7644128 (supersecondary structure, per the in-query comment).
query = """
SELECT DISTINCT ?s ?sLabel
WHERE
{
  # Instance of Supersecondary Structure
  ?s wdt:P31 wd:Q7644128  
}
"""

In [10]:
# Check 'Supersecondary Structure' off the list of remaining types
# (ValueError if the name is not in the list), then run the query above and
# save the ids.
ntype = 'Supersecondary Structure'
types.remove(ntype)
filename = 'data/ss_struct.txt'
query_node_type(query, ntype, filename)

Wrote 686 ids of type Supersecondary Structure to file data/ss_struct.txt


## Structural Motif

In [11]:
# SPARQL query selecting every distinct item ?s that has a P31 statement
# pointing at Q3273544 (structural motif, per the in-query comment).
query = """
SELECT DISTINCT ?s ?sLabel
WHERE
{
  # Instance of Structural Motif
  ?s wdt:P31 wd:Q3273544  
}
"""

In [12]:
# Check 'Structural Motif' off the list of remaining types (ValueError if the
# name is not in the list), then run the query above and save the ids.
ntype = 'Structural Motif'
types.remove(ntype)
filename = 'data/s_motif.txt'
query_node_type(query, ntype, filename)

Wrote 286 ids of type Structural Motif to file data/s_motif.txt


## Active Site

In [13]:
# SPARQL query selecting every distinct item ?s that has a P31 statement
# pointing at Q423026 (active site, per the in-query comment).
query = """
SELECT DISTINCT ?s ?sLabel
WHERE
{
  # Instance of Active Stie
  ?s wdt:P31 wd:Q423026  
}
"""

In [14]:
# Check 'Active Site' off the list of remaining types (ValueError if the
# name is not in the list), then run the query above and save the ids.
ntype = 'Active Site'
types.remove(ntype)
filename = 'data/active_site.txt'
query_node_type(query, ntype, filename)

Wrote 132 ids of type Active Site to file data/active_site.txt


## Binding Site

In [15]:
# SPARQL query selecting every distinct item ?s that has a P31 statement
# pointing at Q616005 (binding site, per the in-query comment).
query = """
SELECT DISTINCT ?s ?sLabel
WHERE
{
  # Instance of Binding Stie
  ?s wdt:P31 wd:Q616005
}
"""

In [16]:
# Check 'Binding Site' off the list of remaining types (ValueError if the
# name is not in the list), then run the query above and save the ids.
ntype = 'Binding Site'
types.remove(ntype)
filename = 'data/binding_site.txt'
query_node_type(query, ntype, filename)

Wrote 76 ids of type Binding Site to file data/binding_site.txt


## Post-Translational Modification

In [17]:
# SPARQL query selecting every distinct item ?s that has a P31 statement
# pointing at Q898362 (post-translational modification, per the in-query
# comment).
query = """
SELECT DISTINCT ?s ?sLabel
WHERE
{
  # Instance of Postranslational Modification
  ?s wdt:P31 wd:Q898362
  }
"""

In [18]:
# Check 'Post-translational Modification' off the list of remaining types
# (ValueError if the name is not in the list), then run the query above and
# save the ids.
ntype = 'Post-translational Modification'
types.remove(ntype)
filename = 'data/ptm.txt'
query_node_type(query, ntype, filename)

Wrote 18 ids of type Post-translational Modification to file data/ptm.txt


## Chemical Hazard

In [19]:
# SPARQL query selecting every distinct item ?s that has a P279 statement
# pointing at Q21167512 (chemical hazard, per the in-query comment).
# Unlike the P31 queries this matches subclasses, and only direct ones: the
# pattern has no property path, so deeper descendants are not followed.
query = """
SELECT DISTINCT ?s ?sLabel
WHERE
{
  # Subclass of Chemical Hazard
  ?s wdt:P279 wd:Q21167512
  }
"""

In [20]:
# Check 'Chemical Hazard' off the list of remaining types (ValueError if the
# name is not in the list), then run the query above and save the ids.
ntype = 'Chemical Hazard'
types.remove(ntype)
filename = 'data/chem_hazard.txt'
query_node_type(query, ntype, filename)

Wrote 689 ids of type Chemical Hazard to file data/chem_hazard.txt


## Symptom

In [21]:
# SPARQL query selecting every distinct item ?s that has a P279 statement
# pointing at Q169872 (symptom, per the in-query comment).
# Only direct subclasses match: the pattern has no property path, so deeper
# descendants are not followed.
query = """
SELECT DISTINCT ?s ?sLabel
WHERE
{
  # Subclass of Symptom
  ?s wdt:P279 wd:Q169872
  }
"""

In [22]:
# Check 'Symptom' off the list of remaining types (ValueError if the name is
# not in the list), then run the query above and save the ids.
ntype = 'Symptom'
types.remove(ntype)
filename = 'data/symptoms.txt'
query_node_type(query, ntype, filename)

Wrote 38 ids of type Symptom to file data/symptoms.txt


## Protein Family

In [23]:
# SPARQL query selecting every distinct item ?s that has a P31 statement
# pointing at Q417841 (protein family, per the in-query comment).
query = """
SELECT DISTINCT ?s ?sLabel
WHERE
{
  # Instance of Protein Family
  ?s wdt:P31 wd:Q417841
  }
"""

In [24]:
# Check 'Protein Family' off the list of remaining types (ValueError if the
# name is not in the list), then run the query above and save the ids.
ntype = 'Protein Family'
types.remove(ntype)
filename = 'data/p_fams.txt'
query_node_type(query, ntype, filename)

Wrote 20390 ids of type Protein Family to file data/p_fams.txt


## Protein Domain

In [25]:
# SPARQL query selecting every distinct item ?s that has a P31 statement
# pointing at Q898273 (protein domain, per the in-query comment).
query = """
SELECT DISTINCT ?s ?sLabel
WHERE
{
  # Instance of Protein Domain
  ?s wdt:P31 wd:Q898273
  }
"""

In [26]:
# Check 'Protein Domain' off the list of remaining types (ValueError if the
# name is not in the list), then run the query above and save the ids.
ntype = 'Protein Domain'
types.remove(ntype)
filename = 'data/p_domain.txt'
query_node_type(query, ntype, filename)

Wrote 9056 ids of type Protein Domain to file data/p_domain.txt


## Dump info on the files

In [27]:
# Persist the node-type -> id-file mapping, including the entries added by
# query_node_type, as indented JSON in node_info.json.
with open('node_info.json', mode='w') as fout:
    fout.write(json.dumps(node_files, indent=2))