# Alternative ID Generation Script

This notebook collects every ID used in the interactions table and records its alternative IDs (PharmGKB, NCBI, MeSH), so that the same entity can be looked up in other databases.

In [1]:
# Progress Bar I found on the internet.
# https://github.com/alexanderkuk/log-progress
from progress_bar import log_progress
import re
import sqlite3
import json
from shutil import copy2

In [2]:
# --- Run switches -----------------------------------------------------------
# WRITE: commit the generated rows to the database at the end of the notebook.
WRITE = True
# DROP_AND_REMAKE: drop the alternative-IDs table before recreating it.
DROP_AND_REMAKE = True

# TSV files containing Pharm-GKB data
PHARMGKB_INTERACTIONS_FILE          = 'tsv_data/pharmgkb/relationships.tsv'
PHARMGKB_CHEMICAL_IDS_FILE          = 'tsv_data/pharmgkb/ids/chemicals.tsv'
PHARMGKB_DRUGS_IDS_FILE             = 'tsv_data/pharmgkb/ids/drugs.tsv'
PHARMGKB_GENES_IDS_FILE             = 'tsv_data/pharmgkb/ids/genes.tsv'
PHARMGKB_PHENOTYPES_IDS_FILE        = 'tsv_data/pharmgkb/ids/phenotypes.tsv'

# Files scanned to build the ID lookup table; each entry is a dict with a "filename" key.
PHARMGKB_ID_FILES = [
    {"filename": PHARMGKB_CHEMICAL_IDS_FILE},
    {"filename": PHARMGKB_DRUGS_IDS_FILE},
    {"filename": PHARMGKB_GENES_IDS_FILE},
    {"filename": PHARMGKB_PHENOTYPES_IDS_FILE},
]

# --- Output table -----------------------------------------------------------
TABLE_NAME = "alternative_ids"
ID_COLUMN = "id"
VALUES_COLUMN = "vals"
TYPE_COLUMN = "type"

# IF EXISTS makes the drop safe on a database that does not have the table yet;
# a plain DROP TABLE raises sqlite3.OperationalError in that case.
DROP_SQL = 'DROP TABLE IF EXISTS {table};'.format(table = TABLE_NAME)
# The table has two columns only: the ID and its values. TYPE_COLUMN is not a
# table column here; it is used as a dictionary key and in QUERY_UNIQUE_IDS.
SCHEMA = 'CREATE TABLE IF NOT EXISTS {table} ({col1} VARCHAR PRIMARY KEY UNIQUE NOT NULL, {col2} VARCHAR);'.format(table = TABLE_NAME, col1=ID_COLUMN, col2=VALUES_COLUMN)
# Distinct (id, type) pairs taken from both sides of the interactions table.
QUERY_UNIQUE_IDS = "SELECT distinct geneids1, {type_col}1 from interactions union select distinct geneids2, {type_col}2 from interactions;".format(type_col = TYPE_COLUMN)

# --- SQLite database files --------------------------------------------------
PLOS_PMC_DB = 'sqlite_data/data.plos-pmc.sqlite'
ALL_DB = 'sqlite_data/data.all.sqlite'

# --- Keys used for each kind of alternative ID ------------------------------
MESH_VALUE = "mesh"
PHARMGKB_VALUE = "pgkb"
NCBI_VALUE = "ncbi"

# Column separator of the TSV input files.
DELIMITER = "\t"


### Connect to Databases

In [3]:
# Open the SQLite database file named by ALL_DB and keep one cursor that the
# following cells use for all queries and inserts.
conn_all = sqlite3.connect(ALL_DB)
cursor_all = conn_all.cursor()

### Create table

In [4]:
# When DROP_AND_REMAKE is set, drop the table first so it is rebuilt empty.
if DROP_AND_REMAKE:
    cursor_all.execute(DROP_SQL)
# SCHEMA is a CREATE TABLE IF NOT EXISTS statement, so this does nothing when
# the table is already present (i.e. when it was not dropped above).
cursor_all.execute(SCHEMA)

    

<sqlite3.Cursor at 0x51a7c60>

### Gather Unique IDs into Hash Map
Each key is an ID; each value is a dictionary that starts with the ID's type and later receives its alternative IDs.

In [5]:
# Maps every distinct ID found in the interactions table to a dictionary that
# initially holds only its type, stored under the TYPE_COLUMN key.
data = {}

print("Executing SQL query. May take a minute.")
cursor = cursor_all.execute(QUERY_UNIQUE_IDS)
interactions = cursor.fetchall()
print("Query complete")

for row in log_progress(interactions, every=100, name="IDs added"):
    if row is None:
        continue

    # Named entity_id rather than `id`: a module-level `id` would shadow the
    # builtin id() for every cell executed after this one.
    entity_id, dgr_type = row[0], row[1]

    # The first type seen for an ID is kept; later rows with the same ID are ignored.
    if entity_id not in data:
        data[entity_id] = {TYPE_COLUMN: dgr_type}

print("Unique IDs added to data")

Executing SQL query. May take a minute.
Query complete


VBox(children=(HTML(value=''), IntProgress(value=0, max=24377)))

Unique IDs added to data


## Create lookup table for other values

In [8]:
# Lookup table: every known identifier of an entity (PharmGKB accession,
# NCBI gene ID, MeSH ID) maps to the same dictionary of that entity's IDs.
id_map = {}

# Compiled once instead of on every row; pattern text and flag are unchanged.
_MESH_ID_RE = re.compile('MESH:[0-9A-Za-z]+', re.IGNORECASE)

_ACCESSION_COLUMN = "PharmGKB Accession Id"


def _get_field(fields, header, column):
    """Return the stripped content of `column` for one row, or None.

    None is returned when the file has no such column, when the row is too
    short to reach it, or when the cell is empty. Treating an empty cell as
    "no value" keeps the empty string from being registered as an ID.
    """
    position = header.get(column)
    if position is None or position >= len(fields):
        return None
    return fields[position].strip() or None


for id_file in PHARMGKB_ID_FILES:
    with open(id_file["filename"]) as file:
        header = None
        fields = None
        try:
            for raw_line in file:
                # Remove only the line terminator. A full strip() would also
                # remove leading/trailing tabs, which shifts the columns when
                # the first cell is empty and drops empty cells at the end.
                fields = raw_line.rstrip("\r\n").split(DELIMITER)

                # The first line holds the column names: {column_name: column_number}
                if header is None:
                    header = {name.strip(): col for col, name in enumerate(fields)}
                    print(header)
                    if _ACCESSION_COLUMN not in header:
                        raise KeyError(_ACCESSION_COLUMN)
                    continue

                pgkb = _get_field(fields, header, _ACCESSION_COLUMN)
                if pgkb is None:
                    # Blank line or row without an accession: nothing to index.
                    continue

                ncbi = _get_field(fields, header, "NCBI Gene ID")

                mesh = None
                external = _get_field(fields, header, 'External Vocabulary')
                if external is not None:
                    # search() rather than match(): the cell can hold several
                    # quoted, comma-separated entries and the MeSH one is not
                    # necessarily at the very start of the string.
                    found = _MESH_ID_RE.search(external)
                    if found is not None:
                        mesh = found.group(0).upper()

                # One shared dictionary per entity, reachable through each of its IDs.
                values = {PHARMGKB_VALUE: pgkb}
                if ncbi is not None:
                    values[NCBI_VALUE] = ncbi
                if mesh is not None:
                    values[MESH_VALUE] = mesh

                id_map[pgkb] = values
                if ncbi is not None:
                    id_map[ncbi] = values
                if mesh is not None:
                    id_map[mesh] = values

        except Exception:
            # Show the row being processed, then re-raise with the original traceback.
            print(fields)
            raise

{'PharmGKB Accession Id': 0, 'Name': 1, 'Generic Names': 2, 'Trade Names': 3, 'Brand Mixtures': 4, 'Type': 5, 'Cross-references': 6, 'SMILES': 7, 'InChI': 8, 'Dosing Guideline': 9, 'External Vocabulary': 10, 'Clinical Annotation Count': 11, 'Variant Annotation Count': 12, 'Pathway Count': 13, 'VIP Count': 14, 'Dosing Guideline Sources': 15, 'Top Clinical Annotation Level': 16, 'Top FDA Label Testing Level': 17, 'Top Any Drug Label Testing Level': 18, 'Label Has Dosing Info': 19, 'Has Rx Annotation': 20}
{'PharmGKB Accession Id': 0, 'Name': 1, 'Generic Names': 2, 'Trade Names': 3, 'Brand Mixtures': 4, 'Type': 5, 'Cross-references': 6, 'SMILES': 7, 'InChI': 8, 'Dosing Guideline': 9, 'External Vocabulary': 10, 'Clinical Annotation Count': 11, 'Variant Annotation Count': 12, 'Pathway Count': 13, 'VIP Count': 14, 'Dosing Guideline Sources': 15, 'Top Clinical Annotation Level': 16, 'Top FDA Label Testing Level': 17, 'Top Any Drug Label Testing Level': 18, 'Label Has Dosing Info': 19, 'Has Rx

In [9]:
# Sanity check: print the entry stored for the ID '1000' as JSON.
# Raises KeyError if that ID is not present in `data`.
print(json.dumps(data['1000']))

{"type": "g"}


### Go through each ID and find all alternative values

In [10]:
def getType(string):
    """Classify an identifier by its prefix.

    Returns PHARMGKB_VALUE for strings starting with 'PA', MESH_VALUE for
    strings starting with 'MESH:', and NCBI_VALUE for everything else.
    """
    # startswith() is a real prefix test. A substring test such as
    # `string[:2] in 'PA'` also accepts '', 'P' and 'A', and
    # `string[:5] in 'MESH:'` also accepts e.g. 'MESH' or 'ESH:'.
    if string.startswith('PA'):
        return PHARMGKB_VALUE
    if string.startswith('MESH:'):
        return MESH_VALUE
    return NCBI_VALUE
# Number of IDs for which an entry was found in id_map.
count = 0
for value in log_progress(data, every=1000, name="IDs looked up "):

    # A missing entry is the only expected failure, so it is handled
    # explicitly; any other error is a real bug and is allowed to surface.
    map_data = id_map.get(value)
    if map_data is None:
        # No alternative IDs known for this ID: report it and move on.
        print(value)
        continue

    # Copy whichever alternative IDs exist (NCBI, then PharmGKB, then MeSH)
    # into this ID's entry; the order fixes the key order of the JSON written later.
    for id_kind in (NCBI_VALUE, PHARMGKB_VALUE, MESH_VALUE):
        if id_kind in map_data:
            data[value][id_kind] = map_data[id_kind]

    count += 1


print("Total with aliases:", count, "/", len(data))

VBox(children=(HTML(value=''), IntProgress(value=0, max=24377)))

100529141
155060
339010
441330
641590
654341
85295
MESH:C531600
MESH:C531616
MESH:C531617
MESH:C531621
MESH:C531625
MESH:C531673
MESH:C531736
MESH:C531760
MESH:C531762
MESH:C531777
MESH:C531816
MESH:C531835
MESH:C531844
MESH:C535282
MESH:C535288
MESH:C535289
MESH:C535297
MESH:C535308
MESH:C535310
MESH:C535311
MESH:C535314
MESH:C535318
MESH:C535326
MESH:C535331
MESH:C535338
MESH:C535342
MESH:C535358
MESH:C535372
MESH:C535377
MESH:C535380
MESH:C535382
MESH:C535396
MESH:C535397
MESH:C535413
MESH:C535416
MESH:C535417
MESH:C535418
MESH:C535431
MESH:C535434
MESH:C535436
MESH:C535440
MESH:C535456
MESH:C535460
MESH:C535463
MESH:C535466
MESH:C535473
MESH:C535474
MESH:C535475
MESH:C535476
MESH:C535477
MESH:C535480
MESH:C535483
MESH:C535484
MESH:C535494
MESH:C535500
MESH:C535506
MESH:C535507
MESH:C535509
MESH:C535516
MESH:C535523
MESH:C535526
MESH:C535531
MESH:C535533
MESH:C535536
MESH:C535540
MESH:C535553
MESH:C535555
MESH:C535566
MESH:C535568
MESH:C535572
MESH:C535575
MESH:C535579
MESH:C535589


MESH:D007757
MESH:D007870
MESH:D007876
MESH:D007906
MESH:D007911
MESH:D007955
MESH:D008039
MESH:D008048
MESH:D008068
MESH:D008118
MESH:D008141
MESH:D008200
MESH:D008205
MESH:D008207
MESH:D008219
MESH:D008230
MESH:D008303
MESH:D008337
MESH:D008342
MESH:D008375
MESH:D008414
MESH:D008444
MESH:D008446
MESH:D008478
MESH:D008480
MESH:D008532
MESH:D008554
MESH:D008577
MESH:D008582
MESH:D008585
MESH:D008796
MESH:D008947
MESH:D008998
MESH:D009057
MESH:D009058
MESH:D009059
MESH:D009078
MESH:D009084
MESH:D009085
MESH:D009087
MESH:D009091
MESH:D009188
MESH:D009198
MESH:D009208
MESH:D009209
MESH:D009216
MESH:D009221
MESH:D009264
MESH:D009335
MESH:D009378
MESH:D009379
MESH:D009382
MESH:D009394
MESH:D009402
MESH:D009463
MESH:D009631
MESH:D009668
MESH:D009669
MESH:D009794
MESH:D009808
MESH:D009810
MESH:D009839
MESH:D009855
MESH:D009958
MESH:D009999
MESH:D010000
MESH:D010005
MESH:D010014
MESH:D010017
MESH:D010148
MESH:D010181
MESH:D010237
MESH:D010255
MESH:D010262
MESH:D010282
MESH:D010304
MESH:D010305

PA165950263
PA165951131
PA165951133
PA165951139
PA165951184
PA165951273
PA165951275
PA165951292
PA165951320
PA165951340
PA165951348
PA165951391
PA165951493
PA165951494
PA165951495
PA165951496
PA165951497
PA165951498
PA165951499
PA165951500
PA165951504
PA165951511
PA165951517
PA165951530
PA165951542
PA165951644
PA165951669
PA165951674
PA165951678
PA165951734
PA165951757
PA165951763
PA165951930
PA165951968
PA165951978
PA165952135
PA165952142
PA165952269
PA165952289
PA165952297
PA165952366
PA165952368
PA165952388
PA165952408
PA165952460
PA165952475
PA165952609
PA165952754
PA165952829
PA165952839
PA165952843
PA165953186
PA165953358
PA165953507
PA165953564
PA165953622
PA165953634
PA165953847
PA165953964
PA165953969
PA165954030
PA165954144
PA165954150
PA165954243
PA165954348
PA165954366
PA165954643
PA165954649
PA165954712
PA165954745
PA165954769
PA165954785
PA165954787
PA165954792
PA165954793
PA165954803
PA165954807
PA165954811
PA165954812
PA165954816
PA165954823
PA165954826
PA165955030
PA16

### Add data to database

In [11]:
# Parameterized upsert: one row per ID, with that ID's dictionary stored as a JSON string.
ALIASES_WRITE = 'INSERT OR REPLACE INTO {table} ( {col1}, {col2}) VALUES ( ?, ?);'.format(table=TABLE_NAME,col1=ID_COLUMN,col2=VALUES_COLUMN)

for key in log_progress(data, every=1000, name="Added to DB"):
    try:
        cursor_all.execute(ALIASES_WRITE, (key, json.dumps(data[key])))
    except Exception:
        # Show the entry that failed (serialization or insert), then re-raise
        # with the original traceback.
        print(key, data[key])
        raise

if WRITE:
    conn_all.commit()
    print("Data committed")
else:
    # Dry run: discard the pending inserts so they cannot be committed by
    # accident later, and do not claim that anything was written.
    conn_all.rollback()
    print("WRITE is False: inserts rolled back, nothing committed")

VBox(children=(HTML(value=''), IntProgress(value=0, max=24377)))

Data committed
