**Build Adjacency Matrix**

In [1]:
import sqlite3
import json

In [2]:
# Progress Bar I found on the internet.
# https://github.com/alexanderkuk/log-progress
from progress_bar import log_progress

In [3]:
# SQLite databases that are read as input.
PLOS_PMC_DB = 'sqlite_data/data.plos-pmc.sqlite'
ALL_DB = 'sqlite_data/data.all.sqlite'

# Output folders, one per database. The generation loop builds each output
# path by plain string concatenation (folder + filename), so the trailing
# slash is required.
PLOS_PMC_JSON_FOLDER = 'json_data/plos-pmc/'
ALL_JSON_FOLDER = 'json_data/all/'

# One entry per file to generate:
#   var_name - name used in the `let <var_name> = ...` prefix written before the JSON
#   type     - which kind of id goes into the file ("gene", "disease" or "chemical")
#   filename - file name, appended to the dataset's output folder
JSON_FILES = [
    dict(var_name="AUTOCOMPLETE_SYMBOL", type="gene", filename="gene_id.json"),
    dict(var_name="AUTOCOMPLETE_DISEASE", type="disease", filename="disease_id.json"),
    dict(var_name="AUTOCOMPLETE_CHEMICAL", type="chemical", filename="chemical_id.json"),
    # Disabled entry, kept for reference:
    # dict(var_name="AUTOCOMPLETE_SYMBOL_SET", type="symbol", filename="symbol_id.json"),
]

# When False, the generation loop still runs but writes no files.
WRITE = True


In [4]:
# Open both databases read-only. This notebook only issues SELECT statements,
# and a plain sqlite3.connect(path) silently creates an empty database file
# when the path is wrong, which postpones the failure to a confusing
# "no such table" error at query time. With mode=ro a missing file fails here.
# NOTE: the path is interpolated into a SQLite URI as-is, so it must not
# contain '?' or '#'.
conn_plos_pmc = sqlite3.connect("file:{}?mode=ro".format(PLOS_PMC_DB), uri=True)
cursor_plos_pmc = conn_plos_pmc.cursor()

conn_all = sqlite3.connect("file:{}?mode=ro".format(ALL_DB), uri=True)
cursor_all = conn_all.cursor()

In [5]:
# One entry per dataset to process. Each entry bundles the database path,
# the folder that receives the generated files, and the already-open
# connection and cursor for that database.
actions = [
    dict(
        name="PLOS-PMC",
        db=PLOS_PMC_DB,
        json_folder=PLOS_PMC_JSON_FOLDER,
        conn=conn_plos_pmc,
        cursor=cursor_plos_pmc,
    ),
    dict(
        name="All",
        db=ALL_DB,
        json_folder=ALL_JSON_FOLDER,
        conn=conn_all,
        cursor=cursor_all,
    ),
]

Queries

In [6]:
# Highest row id in the interactions table.
QUERY_MAX_ID = "SELECT id FROM interactions ORDER BY id DESC LIMIT 1"

# Both (id, mention) pairs of a single interaction, looked up by row id.
QUERY_INTERACTION = (
    "SELECT distinct geneids1, mention1, geneids2, mention2 "
    "FROM interactions WHERE id = ?"
)

# Every (id, mention) pair that appears on either side of any interaction.
# UNION (as opposed to UNION ALL) removes duplicate rows.
NEW_QUERY = (
    "SELECT geneids1, mention1 from interactions "
    "union "
    "select geneids2, mention2 from interactions"
)

Step through every interaction.

1. If geneids1 not in matrix - insert it as dict.
2. If geneids2 not in matrix[geneids1] - insert it as []
3. If probability not in matrix[geneids1][geneids2] - insert it.
4. Perform the reverse.

In [7]:
def isGene(id):
    """Return True when `id` is neither a drug id nor a disease id.

    Anything not matched by isDrug or isDisease counts as a gene,
    including the empty string.
    """
    return not (isDrug(id) or isDisease(id))

def isDrug(id):
    """Return True when `id` is non-empty and its first character is 'C'."""
    if len(id) == 0:
        return False
    return id[0] == 'C'

def isDisease(id):
    """Return True when `id` is non-empty and its first character is 'D'."""
    if len(id) == 0:
        return False
    return id[0] == 'D'

    

In [8]:
# Generate the output files.
#
# For every dataset in `actions`, fetch all (id, mention) pairs once, then for
# every entry in JSON_FILES keep the pairs whose id passes that entry's type
# check, group the ids by mention, and (when WRITE is set) write the result as
#     let <var_name> = [{"symbol": <mention>, "values": [<id>, ...]}, ...]

# Maps a JSON_FILES "type" to the predicate that selects ids of that type.
CHECK_BY_TYPE = {
    "gene": isGene,
    "disease": isDisease,
    "chemical": isDrug,
}

for action in log_progress(actions, every=1, name="Total Datasets generated"):
    print("Executing SQL query. May take a minute.")
    cursor = action["cursor"].execute(NEW_QUERY)
    interactions = cursor.fetchall()
    print("Query complete")
    for json_file in log_progress(JSON_FILES, every=1, name=action["name"]+" JSON files generated"):
        check = CHECK_BY_TYPE.get(json_file["type"])
        if check is None:
            # Both placeholders must be filled by the same format() call;
            # the dataset name is used so the message stays readable.
            raise ValueError(
                '{type} is an unrecognized type in actions["{action}"]'.format(
                    type=json_file["type"], action=action["name"]
                )
            )

        output_path = action["json_folder"] + json_file["filename"]
        typeahead = {}  # mention -> list of distinct ids, in first-seen order
        final = []

        for row in log_progress(interactions, every=1000, name=output_path+" Progess"):
            if row is None:
                continue

            id1 = row[0]
            symbol1 = row[1]

            if not check(id1):
                continue

            ids_for_symbol = typeahead.setdefault(symbol1, [])
            if id1 not in ids_for_symbol:
                ids_for_symbol.append(id1)

        if WRITE:
            for key in typeahead:
                final.append( {"symbol": key, "values": typeahead[key]} )
            with open(output_path, "w+") as file:
                file.write("let " + json_file["var_name"]+" = "+json.dumps( final ))

VBox(children=(HTML(value=''), IntProgress(value=0, max=2)))

Executing SQL query. May take a minute.
Query complete


VBox(children=(HTML(value=''), IntProgress(value=0, max=3)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=35276)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=35276)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=35276)))

Executing SQL query. May take a minute.
Query complete


VBox(children=(HTML(value=''), IntProgress(value=0, max=3)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=43692)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=43692)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=43692)))