# Alternative ID Generation Script

This will produce all the various IDs that can be used to lookup in different databases.

In [1]:
# Progress Bar I found on the internet.
# https://github.com/alexanderkuk/log-progress
from progress_bar import log_progress
import re
import sqlite3
import json
from shutil import copy2

In [2]:
WRITE = True
DROP_AND_REMAKE = True

# TSV files containing Pharm-GKB data
PHARMGKB_INTERACTIONS_FILE          = 'tsv_data/pharmgkb/relationships.tsv'
PHARMGKB_CHEMICAL_IDS_FILE          = 'tsv_data/pharmgkb/ids/chemicals.tsv'
PHARMGKB_DRUGS_IDS_FILE             = 'tsv_data/pharmgkb/ids/drugs.tsv'
PHARMGKB_GENES_IDS_FILE             = 'tsv_data/pharmgkb/ids/genes.tsv'
PHARMGKB_PHENOTYPES_IDS_FILE        = 'tsv_data/pharmgkb/ids/phenotypes.tsv'

PHARMGKB_ID_FILES = [
     {"filename":PHARMGKB_CHEMICAL_IDS_FILE},
     {"filename":PHARMGKB_DRUGS_IDS_FILE},
    {"filename":PHARMGKB_GENES_IDS_FILE},
     {"filename":PHARMGKB_PHENOTYPES_IDS_FILE},
]

TABLE_NAME = "alternative_ids"
MESH_VALUE = "mesh"
PHARMGKB_VALUE = "pgkb"
NCBI_VALUE = "ncbi"
VALUES_COLUMN = "vals"
TYPE_COLUMN = "type"
DROP_SQL = 'DROP TABLE IF EXISTS {table};'.format(table = TABLE_NAME)
SCHEMA = '''CREATE TABLE IF NOT EXISTS {table} (
            {col1} VARCHAR UNIQUE,
            {col2} VARCHAR UNIQUE,
            {col3} VARCHAR UNIQUE,
            {col4} VARCHAR,
            {col5} VARCHAR,
            PRIMARY KEY ({col1}, {col2}, {col3})
            );'''.format(
            table = TABLE_NAME, 
            col1=MESH_VALUE,
            col2=PHARMGKB_VALUE,
            col3=NCBI_VALUE,
            col4=VALUES_COLUMN,
            col5=TYPE_COLUMN)
QUERY_UNIQUE_IDS = '''  SELECT DISTINCT geneids1,
                        {type_col}1 FROM interactions
                            union
                        SELECT DISTINCT geneids2,
                        {type_col}2 FROM interactions;'''.format(type_col = TYPE_COLUMN)

PLOS_PMC_DB = 'sqlite_data/data.plos-pmc.sqlite'
ALL_DB = 'sqlite_data/data.all.sqlite'



DELIMITER = "\t"


### Connect to Databases

In [3]:
conn_all = sqlite3.connect(ALL_DB)
cursor_all = conn_all.cursor()

### Create table

In [4]:
if DROP_AND_REMAKE:
    cursor_all.execute(DROP_SQL)
cursor_all.execute(SCHEMA)

    

<sqlite3.Cursor at 0x27add1ecf80>

### Gather Unique IDs into Hash Map
The index is the id, the value is an object.

In [5]:
data = {}

print("Executing SQL query. May take a minute.")
cursor = cursor_all.execute(QUERY_UNIQUE_IDS)
interactions = cursor.fetchall()
print("Query complete")

for row in log_progress(interactions, every=100, name="IDs added"):

    if row == None:
        continue

    
    id = row[0]
    dgr_type = row[1]

    if id not in data:
        data[id] = { TYPE_COLUMN : dgr_type}
        
print("Unique IDs added to data")

Executing SQL query. May take a minute.
Query complete


VBox(children=(HTML(value=''), IntProgress(value=0, max=25545)))

Unique IDs added to data


## Create lookup table for other values

In [6]:
id_map = {}

for id_file in PHARMGKB_ID_FILES:
    with open(id_file["filename"]) as file:
        try:
            header = None
            linenum = 0
            for line in file:
                linenum+=1
                pgkb = None
                ncbi = None
                mesh = None

                # deliminate the lines
                line = line.strip().split(DELIMITER)

                # Read the headers of the file and assign them to a dictionary {column_name: column_number}
                if linenum == 1:
                    header = {name.strip(): col for col, name in enumerate(line)}
                    print(header)                    
                    continue

                


                # set variables
                pgkb = line[header["PharmGKB Accession Id"]]

                if "NCBI Gene ID" in header:
                    ncbi = line[header["NCBI Gene ID"]]
                if 'External Vocabulary'in header:
                    if len(line) > header['External Vocabulary']: # If a line doesn't have data on the end, it wont be in the delimination
                        external = str(line[header['External Vocabulary']]).replace('"', "") # weird bug where quotes break regex
                        match = re.match('MESH:[0-9A-Za-z]+',external,re.IGNORECASE)
                        if match != None:
                            mesh = match.group(0).upper()

                # fill map   
                values = {PHARMGKB_VALUE: pgkb}                
                if ncbi is not None:
                    values[NCBI_VALUE] = ncbi
                if mesh is not None:
                    values[MESH_VALUE] = mesh
        
                id_map[pgkb] = values
                
                if ncbi is not None:
                    id_map[ncbi] = values
                if mesh is not None:
                    id_map[mesh] = values
                

        except Exception as e:
            print(line)
            raise e

{'PharmGKB Accession Id': 0, 'Name': 1, 'Generic Names': 2, 'Trade Names': 3, 'Brand Mixtures': 4, 'Type': 5, 'Cross-references': 6, 'SMILES': 7, 'InChI': 8, 'Dosing Guideline': 9, 'External Vocabulary': 10, 'Clinical Annotation Count': 11, 'Variant Annotation Count': 12, 'Pathway Count': 13, 'VIP Count': 14, 'Dosing Guideline Sources': 15, 'Top Clinical Annotation Level': 16, 'Top FDA Label Testing Level': 17, 'Top Any Drug Label Testing Level': 18, 'Label Has Dosing Info': 19, 'Has Rx Annotation': 20}
{'PharmGKB Accession Id': 0, 'Name': 1, 'Generic Names': 2, 'Trade Names': 3, 'Brand Mixtures': 4, 'Type': 5, 'Cross-references': 6, 'SMILES': 7, 'InChI': 8, 'Dosing Guideline': 9, 'External Vocabulary': 10, 'Clinical Annotation Count': 11, 'Variant Annotation Count': 12, 'Pathway Count': 13, 'VIP Count': 14, 'Dosing Guideline Sources': 15, 'Top Clinical Annotation Level': 16, 'Top FDA Label Testing Level': 17, 'Top Any Drug Label Testing Level': 18, 'Label Has Dosing Info': 19, 'Has Rx

In [7]:
# Just to test
print(json.dumps(data['1000']))

{"type": "g"}


### Go through each ID and find all alternative values

In [8]:


def getType(string):
    if string[:2] in 'PA':
        return PHARMGKB_VALUE
    elif string[:5] in 'MESH:':
        return MESH_VALUE
    else:
        return NCBI_VALUE

count = 0
for value in log_progress(data, every=1000, name="IDs looked up "):

    try:
        ncbi_value = None
        pharmgkb_value = None
        mesh_value = None

        data_values = data.values()
        
        # Get ID type
        if value in id_map:
            map_data = id_map[value]
        

            # Lookup NCBI
            if NCBI_VALUE in map_data:
                ncbi_value = map_data[NCBI_VALUE]
                data[value][NCBI_VALUE] = ncbi_value

            # lookup PharmGKB
            if PHARMGKB_VALUE in map_data:
                pharmgkb_value = map_data[PHARMGKB_VALUE]
                data[value][PHARMGKB_VALUE] = pharmgkb_value

            # Lookup MESH
            if MESH_VALUE in map_data:
                mesh_value = map_data[MESH_VALUE]
                data[value][MESH_VALUE] = mesh_value

        if value not in data_values:
            data[value][getType(value)] = value
            # print(value, data[value])

        # If no alias found
        if len(data[value]) == 2:
            print(value, data[value])
        else:
            count+=1
    except Exception as e:
        print(value, str(e))
    

print("Total with aliases:", count, "/", len(data))

VBox(children=(HTML(value=''), IntProgress(value=0, max=25021)))

100529141 {'type': 'g', 'ncbi': '100529141'}
155060 {'type': 'g', 'ncbi': '155060'}
339010 {'type': 'g', 'ncbi': '339010'}
441330 {'type': 'g', 'ncbi': '441330'}
641590 {'type': 'g', 'ncbi': '641590'}
654341 {'type': 'g', 'ncbi': '654341'}
85295 {'type': 'g', 'ncbi': '85295'}
MESH:C531600 {'type': 'd', 'mesh': 'MESH:C531600'}
MESH:C531616 {'type': 'd', 'mesh': 'MESH:C531616'}
MESH:C531617 {'type': 'd', 'mesh': 'MESH:C531617'}
MESH:C531621 {'type': 'd', 'mesh': 'MESH:C531621'}
MESH:C531625 {'type': 'd', 'mesh': 'MESH:C531625'}
MESH:C531673 {'type': 'd', 'mesh': 'MESH:C531673'}
MESH:C531736 {'type': 'd', 'mesh': 'MESH:C531736'}
MESH:C531760 {'type': 'd', 'mesh': 'MESH:C531760'}
MESH:C531762 {'type': 'd', 'mesh': 'MESH:C531762'}
MESH:C531777 {'type': 'd', 'mesh': 'MESH:C531777'}
MESH:C531816 {'type': 'd', 'mesh': 'MESH:C531816'}
MESH:C531835 {'type': 'd', 'mesh': 'MESH:C531835'}
MESH:C531844 {'type': 'd', 'mesh': 'MESH:C531844'}
MESH:C535282 {'type': 'd', 'mesh': 'MESH:C535282'}
MESH:C535

MESH:C536334 {'type': 'd', 'mesh': 'MESH:C536334'}
MESH:C536335 {'type': 'd', 'mesh': 'MESH:C536335'}
MESH:C536353 {'type': 'd', 'mesh': 'MESH:C536353'}
MESH:C536357 {'type': 'd', 'mesh': 'MESH:C536357'}
MESH:C536358 {'type': 'd', 'mesh': 'MESH:C536358'}
MESH:C536376 {'type': 'd', 'mesh': 'MESH:C536376'}
MESH:C536382 {'type': 'd', 'mesh': 'MESH:C536382'}
MESH:C536385 {'type': 'd', 'mesh': 'MESH:C536385'}
MESH:C536390 {'type': 'd', 'mesh': 'MESH:C536390'}
MESH:C536397 {'type': 'd', 'mesh': 'MESH:C536397'}
MESH:C536399 {'type': 'd', 'mesh': 'MESH:C536399'}
MESH:C536404 {'type': 'd', 'mesh': 'MESH:C536404'}
MESH:C536411 {'type': 'd', 'mesh': 'MESH:C536411'}
MESH:C536416 {'type': 'd', 'mesh': 'MESH:C536416'}
MESH:C536426 {'type': 'd', 'mesh': 'MESH:C536426'}
MESH:C536436 {'type': 'd', 'mesh': 'MESH:C536436'}
MESH:C536438 {'type': 'd', 'mesh': 'MESH:C536438'}
MESH:C536439 {'type': 'd', 'mesh': 'MESH:C536439'}
MESH:C536447 {'type': 'd', 'mesh': 'MESH:C536447'}
MESH:C536453 {'type': 'd', 'mes

MESH:C538343 {'type': 'd', 'mesh': 'MESH:C538343'}
MESH:C538346 {'type': 'd', 'mesh': 'MESH:C538346'}
MESH:C538355 {'type': 'd', 'mesh': 'MESH:C538355'}
MESH:C538361 {'type': 'd', 'mesh': 'MESH:C538361'}
MESH:C538362 {'type': 'd', 'mesh': 'MESH:C538362'}
MESH:C538363 {'type': 'd', 'mesh': 'MESH:C538363'}
MESH:C538365 {'type': 'd', 'mesh': 'MESH:C538365'}
MESH:C538375 {'type': 'd', 'mesh': 'MESH:C538375'}
MESH:C538380 {'type': 'd', 'mesh': 'MESH:C538380'}
MESH:C538381 {'type': 'd', 'mesh': 'MESH:C538381'}
MESH:C538384 {'type': 'd', 'mesh': 'MESH:C538384'}
MESH:C538394 {'type': 'd', 'mesh': 'MESH:C538394'}
MESH:C538400 {'type': 'd', 'mesh': 'MESH:C538400'}
MESH:C538445 {'type': 'd', 'mesh': 'MESH:C538445'}
MESH:C538457 {'type': 'd', 'mesh': 'MESH:C538457'}
MESH:C538475 {'type': 'd', 'mesh': 'MESH:C538475'}
MESH:C538481 {'type': 'd', 'mesh': 'MESH:C538481'}
MESH:C538494 {'type': 'd', 'mesh': 'MESH:C538494'}
MESH:C538507 {'type': 'd', 'mesh': 'MESH:C538507'}
MESH:C538525 {'type': 'd', 'mes

MESH:C566452 {'type': 'd', 'mesh': 'MESH:C566452'}
MESH:C566454 {'type': 'd', 'mesh': 'MESH:C566454'}
MESH:C566465 {'type': 'd', 'mesh': 'MESH:C566465'}
MESH:C566476 {'type': 'd', 'mesh': 'MESH:C566476'}
MESH:C566478 {'type': 'd', 'mesh': 'MESH:C566478'}
MESH:C566504 {'type': 'd', 'mesh': 'MESH:C566504'}
MESH:C566528 {'type': 'd', 'mesh': 'MESH:C566528'}
MESH:C566555 {'type': 'd', 'mesh': 'MESH:C566555'}
MESH:C566582 {'type': 'd', 'mesh': 'MESH:C566582'}
MESH:C566593 {'type': 'd', 'mesh': 'MESH:C566593'}
MESH:C566600 {'type': 'd', 'mesh': 'MESH:C566600'}
MESH:C566618 {'type': 'd', 'mesh': 'MESH:C566618'}
MESH:C566619 {'type': 'd', 'mesh': 'MESH:C566619'}
MESH:C566719 {'type': 'd', 'mesh': 'MESH:C566719'}
MESH:C566730 {'type': 'd', 'mesh': 'MESH:C566730'}
MESH:C566766 {'type': 'd', 'mesh': 'MESH:C566766'}
MESH:C566800 {'type': 'd', 'mesh': 'MESH:C566800'}
MESH:C566801 {'type': 'd', 'mesh': 'MESH:C566801'}
MESH:C566815 {'type': 'd', 'mesh': 'MESH:C566815'}
MESH:C566822 {'type': 'd', 'mes

MESH:D002916 {'type': 'd', 'mesh': 'MESH:D002916'}
MESH:D003048 {'type': 'd', 'mesh': 'MESH:D003048'}
MESH:D003072 {'type': 'd', 'mesh': 'MESH:D003072'}
MESH:D003074 {'type': 'd', 'mesh': 'MESH:D003074'}
MESH:D003161 {'type': 'd', 'mesh': 'MESH:D003161'}
MESH:D003233 {'type': 'd', 'mesh': 'MESH:D003233'}
MESH:D003291 {'type': 'd', 'mesh': 'MESH:D003291'}
MESH:D003323 {'type': 'd', 'mesh': 'MESH:D003323'}
MESH:D003444 {'type': 'd', 'mesh': 'MESH:D003444'}
MESH:D003551 {'type': 'd', 'mesh': 'MESH:D003551'}
MESH:D003554 {'type': 'd', 'mesh': 'MESH:D003554'}
MESH:D003668 {'type': 'd', 'mesh': 'MESH:D003668'}
MESH:D003699 {'type': 'd', 'mesh': 'MESH:D003699'}
MESH:D003744 {'type': 'd', 'mesh': 'MESH:D003744'}
MESH:D003773 {'type': 'd', 'mesh': 'MESH:D003773'}
MESH:D003803 {'type': 'd', 'mesh': 'MESH:D003803'}
MESH:D003807 {'type': 'd', 'mesh': 'MESH:D003807'}
MESH:D003921 {'type': 'd', 'mesh': 'MESH:D003921'}
MESH:D004194 {'type': 'd', 'mesh': 'MESH:D004194'}
MESH:D004195 {'type': 'd', 'mes

MESH:D010483 {'type': 'd', 'mesh': 'MESH:D010483'}
MESH:D010485 {'type': 'd', 'mesh': 'MESH:D010485'}
MESH:D010509 {'type': 'd', 'mesh': 'MESH:D010509'}
MESH:D010522 {'type': 'd', 'mesh': 'MESH:D010522'}
MESH:D010524 {'type': 'd', 'mesh': 'MESH:D010524'}
MESH:D010688 {'type': 'd', 'mesh': 'MESH:D010688'}
MESH:D010842 {'type': 'd', 'mesh': 'MESH:D010842'}
MESH:D010850 {'type': 'd', 'mesh': 'MESH:D010850'}
MESH:D010916 {'type': 'd', 'mesh': 'MESH:D010916'}
MESH:D010921 {'type': 'd', 'mesh': 'MESH:D010921'}
MESH:D010922 {'type': 'd', 'mesh': 'MESH:D010922'}
MESH:D011007 {'type': 'd', 'mesh': 'MESH:D011007'}
MESH:D011008 {'type': 'd', 'mesh': 'MESH:D011008'}
MESH:D011017 {'type': 'd', 'mesh': 'MESH:D011017'}
MESH:D011018 {'type': 'd', 'mesh': 'MESH:D011018'}
MESH:D011019 {'type': 'd', 'mesh': 'MESH:D011019'}
MESH:D011023 {'type': 'd', 'mesh': 'MESH:D011023'}
MESH:D011026 {'type': 'd', 'mesh': 'MESH:D011026'}
MESH:D011027 {'type': 'd', 'mesh': 'MESH:D011027'}
MESH:D011123 {'type': 'd', 'mes

MESH:D017714 {'type': 'd', 'mesh': 'MESH:D017714'}
MESH:D017719 {'type': 'd', 'mesh': 'MESH:D017719'}
MESH:D017731 {'type': 'd', 'mesh': 'MESH:D017731'}
MESH:D017759 {'type': 'd', 'mesh': 'MESH:D017759'}
MESH:D017769 {'type': 'd', 'mesh': 'MESH:D017769'}
MESH:D017823 {'type': 'd', 'mesh': 'MESH:D017823'}
MESH:D017887 {'type': 'd', 'mesh': 'MESH:D017887'}
MESH:D018177 {'type': 'd', 'mesh': 'MESH:D018177'}
MESH:D018182 {'type': 'd', 'mesh': 'MESH:D018182'}
MESH:D018184 {'type': 'd', 'mesh': 'MESH:D018184'}
MESH:D018185 {'type': 'd', 'mesh': 'MESH:D018185'}
MESH:D018186 {'type': 'd', 'mesh': 'MESH:D018186'}
MESH:D018194 {'type': 'd', 'mesh': 'MESH:D018194'}
MESH:D018195 {'type': 'd', 'mesh': 'MESH:D018195'}
MESH:D018202 {'type': 'd', 'mesh': 'MESH:D018202'}
MESH:D018209 {'type': 'd', 'mesh': 'MESH:D018209'}
MESH:D018212 {'type': 'd', 'mesh': 'MESH:D018212'}
MESH:D018215 {'type': 'd', 'mesh': 'MESH:D018215'}
MESH:D018220 {'type': 'd', 'mesh': 'MESH:D018220'}
MESH:D018223 {'type': 'd', 'mes

MESH:D055397 {'type': 'd', 'mesh': 'MESH:D055397'}
MESH:D055534 {'type': 'd', 'mesh': 'MESH:D055534'}
MESH:D055577 {'type': 'd', 'mesh': 'MESH:D055577'}
MESH:D055613 {'type': 'd', 'mesh': 'MESH:D055613'}
MESH:D055623 {'type': 'd', 'mesh': 'MESH:D055623'}
MESH:D055673 {'type': 'd', 'mesh': 'MESH:D055673'}
MESH:D055732 {'type': 'd', 'mesh': 'MESH:D055732'}
MESH:D055948 {'type': 'd', 'mesh': 'MESH:D055948'}
MESH:D055953 {'type': 'd', 'mesh': 'MESH:D055953'}
MESH:D055959 {'type': 'd', 'mesh': 'MESH:D055959'}
MESH:D055963 {'type': 'd', 'mesh': 'MESH:D055963'}
MESH:D055985 {'type': 'd', 'mesh': 'MESH:D055985'}
MESH:D056005 {'type': 'd', 'mesh': 'MESH:D056005'}
MESH:D056006 {'type': 'd', 'mesh': 'MESH:D056006'}
MESH:D056128 {'type': 'd', 'mesh': 'MESH:D056128'}
MESH:D056151 {'type': 'd', 'mesh': 'MESH:D056151'}
MESH:D056266 {'type': 'd', 'mesh': 'MESH:D056266'}
MESH:D056267 {'type': 'd', 'mesh': 'MESH:D056267'}
MESH:D056304 {'type': 'd', 'mesh': 'MESH:D056304'}
MESH:D056305 {'type': 'd', 'mes

PA150595617 {'type': 'c', 'pgkb': 'PA150595617'}
PA151186253 {'type': 'c', 'pgkb': 'PA151186253'}
PA151249535 {'type': 'c', 'pgkb': 'PA151249535'}
PA151917011 {'type': 'r', 'pgkb': 'PA151917011'}
PA151917012 {'type': 'c', 'pgkb': 'PA151917012'}
PA151958300 {'type': 'r', 'pgkb': 'PA151958300'}
PA151958360 {'type': 'r', 'pgkb': 'PA151958360'}
PA151958426 {'type': 'r', 'pgkb': 'PA151958426'}
PA151958596 {'type': 'c', 'pgkb': 'PA151958596'}
PA152031327 {'type': 'c', 'pgkb': 'PA152031327'}
PA152432599 {'type': 'c', 'pgkb': 'PA152432599'}
PA152530737 {'type': 'r', 'pgkb': 'PA152530737'}
PA152530738 {'type': 'r', 'pgkb': 'PA152530738'}
PA153561371 {'type': 'c', 'pgkb': 'PA153561371'}
PA153590860 {'type': 'r', 'pgkb': 'PA153590860'}
PA153619853 {'type': 'r', 'pgkb': 'PA153619853'}
PA153906318 {'type': 'd', 'pgkb': 'PA153906318'}
PA153906323 {'type': 'r', 'pgkb': 'PA153906323'}
PA154081778 {'type': 'r', 'pgkb': 'PA154081778'}
PA154081779 {'type': 'r', 'pgkb': 'PA154081779'}
PA154410481 {'type':

PA164781398 {'type': 'r', 'pgkb': 'PA164781398'}
PA164781399 {'type': 'r', 'pgkb': 'PA164781399'}
PA164783958 {'type': 'c', 'pgkb': 'PA164783958'}
PA164783990 {'type': 'c', 'pgkb': 'PA164783990'}
PA164783999 {'type': 'r', 'pgkb': 'PA164783999'}
PA164784021 {'type': 'c', 'pgkb': 'PA164784021'}
PA164784023 {'type': 'r', 'pgkb': 'PA164784023'}
PA164784024 {'type': 'c', 'pgkb': 'PA164784024'}
PA164784026 {'type': 'r', 'pgkb': 'PA164784026'}
PA164784030 {'type': 'r', 'pgkb': 'PA164784030'}
PA164784033 {'type': 'r', 'pgkb': 'PA164784033'}
PA164784034 {'type': 'r', 'pgkb': 'PA164784034'}
PA164888966 {'type': 'c', 'pgkb': 'PA164888966'}
PA164924487 {'type': 'c', 'pgkb': 'PA164924487'}
PA164924491 {'type': 'r', 'pgkb': 'PA164924491'}
PA164924492 {'type': 'c', 'pgkb': 'PA164924492'}
PA164924493 {'type': 'c', 'pgkb': 'PA164924493'}
PA164924561 {'type': 'c', 'pgkb': 'PA164924561'}
PA164925725 {'type': 'd', 'pgkb': 'PA164925725'}
PA165107055 {'type': 'c', 'pgkb': 'PA165107055'}
PA165107991 {'type':

PA165947871 {'type': 'd', 'pgkb': 'PA165947871'}
PA165947872 {'type': 'd', 'pgkb': 'PA165947872'}
PA165947873 {'type': 'd', 'pgkb': 'PA165947873'}
PA165947874 {'type': 'd', 'pgkb': 'PA165947874'}
PA165947875 {'type': 'd', 'pgkb': 'PA165947875'}
PA165947876 {'type': 'd', 'pgkb': 'PA165947876'}
PA165947877 {'type': 'd', 'pgkb': 'PA165947877'}
PA165948031 {'type': 'r', 'pgkb': 'PA165948031'}
PA165948032 {'type': 'd', 'pgkb': 'PA165948032'}
PA165948039 {'type': 'd', 'pgkb': 'PA165948039'}
PA165948040 {'type': 'd', 'pgkb': 'PA165948040'}
PA165948041 {'type': 'd', 'pgkb': 'PA165948041'}
PA165948045 {'type': 'd', 'pgkb': 'PA165948045'}
PA165948047 {'type': 'd', 'pgkb': 'PA165948047'}
PA165948052 {'type': 'r', 'pgkb': 'PA165948052'}
PA165948092 {'type': 'd', 'pgkb': 'PA165948092'}
PA165948137 {'type': 'd', 'pgkb': 'PA165948137'}
PA165948138 {'type': 'd', 'pgkb': 'PA165948138'}
PA165948139 {'type': 'd', 'pgkb': 'PA165948139'}
PA165948317 {'type': 'd', 'pgkb': 'PA165948317'}
PA165948318 {'type':

PA165958361 {'type': 'r', 'pgkb': 'PA165958361'}
PA165958362 {'type': 'r', 'pgkb': 'PA165958362'}
PA165958363 {'type': 'c', 'pgkb': 'PA165958363'}
PA165958364 {'type': 'c', 'pgkb': 'PA165958364'}
PA165958365 {'type': 'c', 'pgkb': 'PA165958365'}
PA165958370 {'type': 'r', 'pgkb': 'PA165958370'}
PA165958374 {'type': 'c', 'pgkb': 'PA165958374'}
PA165958376 {'type': 'c', 'pgkb': 'PA165958376'}
PA165958377 {'type': 'r', 'pgkb': 'PA165958377'}
PA165958379 {'type': 'r', 'pgkb': 'PA165958379'}
PA165958382 {'type': 'c', 'pgkb': 'PA165958382'}
PA165958383 {'type': 'r', 'pgkb': 'PA165958383'}
PA165958385 {'type': 'r', 'pgkb': 'PA165958385'}
PA165958390 {'type': 'r', 'pgkb': 'PA165958390'}
PA165958395 {'type': 'c', 'pgkb': 'PA165958395'}
PA165958398 {'type': 'r', 'pgkb': 'PA165958398'}
PA165958401 {'type': 'c', 'pgkb': 'PA165958401'}
PA165958404 {'type': 'r', 'pgkb': 'PA165958404'}
PA165958405 {'type': 'r', 'pgkb': 'PA165958405'}
PA165958406 {'type': 'c', 'pgkb': 'PA165958406'}
PA165958407 {'type':

PA166049174 {'type': 'r', 'pgkb': 'PA166049174'}
PA166049175 {'type': 'r', 'pgkb': 'PA166049175'}
PA166049180 {'type': 'r', 'pgkb': 'PA166049180'}
PA166049185 {'type': 'r', 'pgkb': 'PA166049185'}
PA166049186 {'type': 'r', 'pgkb': 'PA166049186'}
PA166049193 {'type': 'r', 'pgkb': 'PA166049193'}
PA166104276 {'type': 'c', 'pgkb': 'PA166104276'}
PA166104279 {'type': 'r', 'pgkb': 'PA166104279'}
PA166104758 {'type': 'd', 'pgkb': 'PA166104758'}
PA166104763 {'type': 'd', 'pgkb': 'PA166104763'}
PA166107796 {'type': 'd', 'pgkb': 'PA166107796'}
PA166109554 {'type': 'c', 'pgkb': 'PA166109554'}
PA166110035 {'type': 'd', 'pgkb': 'PA166110035'}
PA166110036 {'type': 'd', 'pgkb': 'PA166110036'}
PA166110037 {'type': 'd', 'pgkb': 'PA166110037'}
PA166110038 {'type': 'd', 'pgkb': 'PA166110038'}
PA166110039 {'type': 'd', 'pgkb': 'PA166110039'}
PA166110254 {'type': 'r', 'pgkb': 'PA166110254'}
PA166110255 {'type': 'c', 'pgkb': 'PA166110255'}
PA166110256 {'type': 'c', 'pgkb': 'PA166110256'}
PA166110257 {'type':

PA166123331 {'type': 'd', 'pgkb': 'PA166123331'}
PA166123332 {'type': 'd', 'pgkb': 'PA166123332'}
PA166123333 {'type': 'd', 'pgkb': 'PA166123333'}
PA166123336 {'type': 'd', 'pgkb': 'PA166123336'}
PA166123337 {'type': 'd', 'pgkb': 'PA166123337'}
PA166123338 {'type': 'd', 'pgkb': 'PA166123338'}
PA166123339 {'type': 'd', 'pgkb': 'PA166123339'}
PA166123340 {'type': 'd', 'pgkb': 'PA166123340'}
PA166123341 {'type': 'd', 'pgkb': 'PA166123341'}
PA166123342 {'type': 'd', 'pgkb': 'PA166123342'}
PA166123343 {'type': 'd', 'pgkb': 'PA166123343'}
PA166123344 {'type': 'd', 'pgkb': 'PA166123344'}
PA166123345 {'type': 'd', 'pgkb': 'PA166123345'}
PA166123346 {'type': 'c', 'pgkb': 'PA166123346'}
PA166123366 {'type': 'd', 'pgkb': 'PA166123366'}
PA166123367 {'type': 'c', 'pgkb': 'PA166123367'}
PA166123368 {'type': 'd', 'pgkb': 'PA166123368'}
PA166123369 {'type': 'd', 'pgkb': 'PA166123369'}
PA166123389 {'type': 'c', 'pgkb': 'PA166123389'}
PA166123407 {'type': 'c', 'pgkb': 'PA166123407'}
PA166123425 {'type':

PA166131619 {'type': 'd', 'pgkb': 'PA166131619'}
PA166131620 {'type': 'd', 'pgkb': 'PA166131620'}
PA166131621 {'type': 'd', 'pgkb': 'PA166131621'}
PA166131622 {'type': 'd', 'pgkb': 'PA166131622'}
PA166131623 {'type': 'd', 'pgkb': 'PA166131623'}
PA166131628 {'type': 'd', 'pgkb': 'PA166131628'}
PA166131629 {'type': 'd', 'pgkb': 'PA166131629'}
PA166131630 {'type': 'c', 'pgkb': 'PA166131630'}
PA166131631 {'type': 'd', 'pgkb': 'PA166131631'}
PA166151814 {'type': 'd', 'pgkb': 'PA166151814'}
PA166151827 {'type': 'd', 'pgkb': 'PA166151827'}
PA166151992 {'type': 'c', 'pgkb': 'PA166151992'}
PA166152644 {'type': 'd', 'pgkb': 'PA166152644'}
PA166152829 {'type': 'c', 'pgkb': 'PA166152829'}
PA166152838 {'type': 'c', 'pgkb': 'PA166152838'}
PA166152848 {'type': 'd', 'pgkb': 'PA166152848'}
PA166152901 {'type': 'c', 'pgkb': 'PA166152901'}
PA166152935 {'type': 'c', 'pgkb': 'PA166152935'}
PA166152940 {'type': 'c', 'pgkb': 'PA166152940'}
PA166153136 {'type': 'd', 'pgkb': 'PA166153136'}
PA166153171 {'type':

PA449045 {'type': 'r', 'pgkb': 'PA449045'}
PA449048 {'type': 'c', 'pgkb': 'PA449048'}
PA449050 {'type': 'r', 'pgkb': 'PA449050'}
PA449051 {'type': 'c', 'pgkb': 'PA449051'}
PA449059 {'type': 'r', 'pgkb': 'PA449059'}
PA449061 {'type': 'c', 'pgkb': 'PA449061'}
PA449072 {'type': 'c', 'pgkb': 'PA449072'}
PA449088 {'type': 'c', 'pgkb': 'PA449088'}
PA449092 {'type': 'c', 'pgkb': 'PA449092'}
PA449095 {'type': 'r', 'pgkb': 'PA449095'}
PA449096 {'type': 'r', 'pgkb': 'PA449096'}
PA449107 {'type': 'r', 'pgkb': 'PA449107'}
PA449114 {'type': 'r', 'pgkb': 'PA449114'}
PA449165 {'type': 'c', 'pgkb': 'PA449165'}
PA449167 {'type': 'c', 'pgkb': 'PA449167'}
PA449171 {'type': 'c', 'pgkb': 'PA449171'}
PA449173 {'type': 'r', 'pgkb': 'PA449173'}
PA449176 {'type': 'r', 'pgkb': 'PA449176'}
PA449177 {'type': 'c', 'pgkb': 'PA449177'}
PA449197 {'type': 'c', 'pgkb': 'PA449197'}
PA449208 {'type': 'r', 'pgkb': 'PA449208'}
PA449211 {'type': 'c', 'pgkb': 'PA449211'}
PA449212 {'type': 'c', 'pgkb': 'PA449212'}
PA449223 {'

PA451140 {'type': 'c', 'pgkb': 'PA451140'}
PA451141 {'type': 'c', 'pgkb': 'PA451141'}
PA451142 {'type': 'r', 'pgkb': 'PA451142'}
PA451156 {'type': 'c', 'pgkb': 'PA451156'}
PA451159 {'type': 'c', 'pgkb': 'PA451159'}
PA451170 {'type': 'r', 'pgkb': 'PA451170'}
PA451182 {'type': 'c', 'pgkb': 'PA451182'}
PA451185 {'type': 'r', 'pgkb': 'PA451185'}
PA451193 {'type': 'c', 'pgkb': 'PA451193'}
PA451213 {'type': 'c', 'pgkb': 'PA451213'}
PA451221 {'type': 'c', 'pgkb': 'PA451221'}
PA451223 {'type': 'c', 'pgkb': 'PA451223'}
PA451224 {'type': 'c', 'pgkb': 'PA451224'}
PA451236 {'type': 'r', 'pgkb': 'PA451236'}
PA451241 {'type': 'c', 'pgkb': 'PA451241'}
PA451249 {'type': 'r', 'pgkb': 'PA451249'}
PA451250 {'type': 'c', 'pgkb': 'PA451250'}
PA451251 {'type': 'r', 'pgkb': 'PA451251'}
PA451257 {'type': 'c', 'pgkb': 'PA451257'}
PA451258 {'type': 'c', 'pgkb': 'PA451258'}
PA451260 {'type': 'c', 'pgkb': 'PA451260'}
PA451261 {'type': 'c', 'pgkb': 'PA451261'}
PA451271 {'type': 'r', 'pgkb': 'PA451271'}
PA451299 {'

In [9]:
data["155060"]

{'type': 'g', 'ncbi': '155060'}

### Add data to database

In [10]:
ALIASES_WRITE = '''INSERT OR IGNORE INTO {table} (
                {col1}, {col2}, {col3}, {col4}, {col5} )
                VALUES ( ? , ? , ? , ? , ? );'''.format(
            table = TABLE_NAME, 
            col1=MESH_VALUE,
            col2=PHARMGKB_VALUE,
            col3=NCBI_VALUE,
            col4=VALUES_COLUMN,
            col5=TYPE_COLUMN)

if '' in data:
    del data['']

for key in log_progress(data, every=1000, name="Added to DB"):
    try:
        values = data[key]
        values_str = json.dumps(values)
        values_type = values["type"]
        
        
        mesh_val = None
        pgkb_val = None
        ncbi_val = None
        
        if MESH_VALUE in values:
            mesh_val = values[MESH_VALUE]
        
        if PHARMGKB_VALUE in values:
            pgkb_val = values[PHARMGKB_VALUE]
            
        if NCBI_VALUE in values:
            ncbi_val = values[NCBI_VALUE]

    
        cursor_all.execute(ALIASES_WRITE,(mesh_val, pgkb_val, ncbi_val, values_str, values_type))        

    except Exception as e:
        print(key, values)
        raise e
if WRITE:
    conn_all.commit()
    
print("Data committed")

if WRITE:
    with open("all_aliases_for_mike.json", "w+") as file:
        file.write(json.dumps( data ))
        
print("file written")

VBox(children=(HTML(value=''), IntProgress(value=0, max=25021)))

Data committed
file written
