# Alternative ID Generation Script

This notebook produces the alternative IDs (MeSH, PharmGKB, NCBI) that can be used to look up the same entry in different databases.

In [None]:
# Progress Bar I found on the internet.
# https://github.com/alexanderkuk/log-progress
from progress_bar import log_progress
import re
import sqlite3
import json
from shutil import copy2

In [None]:
# --- Run switches ---------------------------------------------------------
# WRITE gates the final commit and the JSON dump.
WRITE = True
# DROP_AND_REMAKE drops the alias table before it is (re)created.
DROP_AND_REMAKE = True

# --- PharmGKB TSV inputs --------------------------------------------------
PHARMGKB_INTERACTIONS_FILE = 'tsv_data/pharmgkb/relationships.tsv'
PHARMGKB_CHEMICAL_IDS_FILE = 'tsv_data/pharmgkb/ids/chemicals.tsv'
PHARMGKB_DRUGS_IDS_FILE = 'tsv_data/pharmgkb/ids/drugs.tsv'
PHARMGKB_GENES_IDS_FILE = 'tsv_data/pharmgkb/ids/genes.tsv'
PHARMGKB_PHENOTYPES_IDS_FILE = 'tsv_data/pharmgkb/ids/phenotypes.tsv'

# One {"filename": ...} record per ID file, in the order they are read.
PHARMGKB_ID_FILES = [
    {"filename": id_path}
    for id_path in (
        PHARMGKB_CHEMICAL_IDS_FILE,
        PHARMGKB_DRUGS_IDS_FILE,
        PHARMGKB_GENES_IDS_FILE,
        PHARMGKB_PHENOTYPES_IDS_FILE,
    )
]

# --- Alias table layout ---------------------------------------------------
TABLE_NAME = "alternative_ids"
MESH_VALUE = "mesh"
PHARMGKB_VALUE = "pgkb"
NCBI_VALUE = "ncbi"
VALUES_COLUMN = "vals"
TYPE_COLUMN = "type"

# Placeholder values shared by the SQL templates below.
_ALIAS_TABLE_FIELDS = {
    "table": TABLE_NAME,
    "col1": MESH_VALUE,
    "col2": PHARMGKB_VALUE,
    "col3": NCBI_VALUE,
    "col4": VALUES_COLUMN,
    "col5": TYPE_COLUMN,
}

DROP_SQL = 'DROP TABLE IF EXISTS {table};'.format(**_ALIAS_TABLE_FIELDS)
SCHEMA = '''CREATE TABLE IF NOT EXISTS {table} (
            {col1} VARCHAR UNIQUE,
            {col2} VARCHAR UNIQUE,
            {col3} VARCHAR UNIQUE,
            {col4} VARCHAR,
            {col5} VARCHAR,
            PRIMARY KEY ({col1}, {col2}, {col3})
            );'''.format(**_ALIAS_TABLE_FIELDS)

# Every distinct (id, type) pair appearing on either side of an interaction.
QUERY_UNIQUE_IDS = '''  SELECT DISTINCT geneids1,
                        {type_col}1 FROM interactions
                            union
                        SELECT DISTINCT geneids2,
                        {type_col}2 FROM interactions;'''.format(type_col=TYPE_COLUMN)

# --- SQLite databases -----------------------------------------------------
PLOS_PMC_DB = 'sqlite_data/data.plos-pmc.sqlite'
ALL_DB = 'sqlite_data/data.all.sqlite'

# Field separator of the TSV inputs.
DELIMITER = "\t"


### Connect to Databases

In [None]:
# Open the SQLite database at ALL_DB and keep a single cursor on it;
# the later cells run their queries and inserts through cursor_all and
# commit through conn_all.
conn_all = sqlite3.connect(ALL_DB)
cursor_all = conn_all.cursor()

### Create table

In [None]:
# The CREATE statement always runs (it is a no-op when the table exists);
# the DROP is put in front of it only when a rebuild was requested.
setup_statements = [SCHEMA]
if DROP_AND_REMAKE:
    setup_statements.insert(0, DROP_SQL)

for statement in setup_statements:
    cursor_all.execute(statement)

    

### Gather Unique IDs into Hash Map
Each key is an ID; each value is a dictionary that starts out holding only that ID's type.

In [None]:
# Maps each ID found in the interactions table to a dict that, at this
# stage, holds only the ID's type under TYPE_COLUMN.
data = {}

print("Executing SQL query. May take a minute.")
cursor = cursor_all.execute(QUERY_UNIQUE_IDS)
interactions = cursor.fetchall()
print("Query complete")

for record in log_progress(interactions, every=100, name="IDs added"):
    if record is None:
        continue

    entity_id, entity_type = record[0], record[1]

    # The first type seen for an ID is kept; later rows with the same ID
    # leave the existing entry untouched.
    data.setdefault(entity_id, {TYPE_COLUMN: entity_type})

print("Unique IDs added to data")

## Create lookup table for other values

In [None]:
# Lookup table: every known ID (PharmGKB accession, NCBI gene ID, MeSH ID)
# maps to one shared dict listing all the IDs known for that entry.
id_map = {}

# Compiled once, outside the loops. It is used with `search` rather than
# `match`: `match` only matches at the start of the string, so a MeSH ID
# was found only when it happened to be the first External Vocabulary entry.
_MESH_ID_RE = re.compile(r'\bMESH:[0-9A-Za-z]+', re.IGNORECASE)

for id_file in PHARMGKB_ID_FILES:
    with open(id_file["filename"]) as file:
        header = None
        line = None  # defined up front so the error report below never hits a NameError
        try:
            for linenum, raw_line in enumerate(file, start=1):
                # Split the line into its columns.
                line = raw_line.strip().split(DELIMITER)

                # The first line holds the column names: {column_name: column_number}.
                if linenum == 1:
                    header = {name.strip(): col for col, name in enumerate(line)}
                    print(header)
                    continue

                pgkb = line[header["PharmGKB Accession Id"]]
                if not pgkb:
                    # Blank line (or missing accession): nothing to index.
                    continue

                # Because the line is stripped before splitting, empty trailing
                # columns are absent from `line`; hence the length checks.
                ncbi = None
                ncbi_col = header.get("NCBI Gene ID")
                if ncbi_col is not None and len(line) > ncbi_col:
                    # An empty cell means "no NCBI ID". Storing '' would create
                    # an id_map[''] entry and make unrelated rows collide on
                    # the UNIQUE ncbi column of the alias table.
                    ncbi = line[ncbi_col] or None

                mesh = None
                vocab_col = header.get('External Vocabulary')
                if vocab_col is not None and len(line) > vocab_col:
                    found = _MESH_ID_RE.search(str(line[vocab_col]))
                    if found is not None:
                        mesh = found.group(0).upper()

                # One shared dict per entry, reachable through each of its IDs.
                values = {PHARMGKB_VALUE: pgkb}
                if ncbi is not None:
                    values[NCBI_VALUE] = ncbi
                if mesh is not None:
                    values[MESH_VALUE] = mesh

                id_map[pgkb] = values
                if ncbi is not None:
                    id_map[ncbi] = values
                if mesh is not None:
                    id_map[mesh] = values

        except Exception:
            # Show the offending line, then re-raise with the original traceback.
            print(line)
            raise

In [None]:
# Spot check: print, as JSON, the entry collected for the ID '1000'.
# Raises KeyError if that ID is not a key of data.
print(json.dumps(data['1000']))

### Go through each ID and find all alternative values

In [None]:


def getType(string):
    """Classify an ID by its prefix.

    Args:
        string: The ID to classify.

    Returns:
        PHARMGKB_VALUE if the ID starts with 'PA', MESH_VALUE if it starts
        with 'MESH:', and NCBI_VALUE for anything else.
    """
    # startswith() is a real prefix test. The previous `string[:2] in 'PA'`
    # was a substring test, so '', 'P' and 'A' were classified as PharmGKB
    # IDs and strings such as 'M' or 'MES' as MeSH IDs.
    if string.startswith('PA'):
        return PHARMGKB_VALUE
    if string.startswith('MESH:'):
        return MESH_VALUE
    return NCBI_VALUE

# For every collected ID, copy in the alternative IDs known to id_map and
# record the ID itself under its own kind. `count` ends up as the number of
# IDs for which at least one alternative was found.
count = 0
for value in log_progress(data, every=1000, name="IDs looked up "):

    try:
        entry = data[value]

        # Copy over whatever id_map knows about this ID (same key order as
        # before: NCBI, PharmGKB, MeSH).
        map_data = id_map.get(value)
        if map_data is not None:
            for id_kind in (NCBI_VALUE, PHARMGKB_VALUE, MESH_VALUE):
                if id_kind in map_data:
                    entry[id_kind] = map_data[id_kind]

        # Always record the ID under its own kind. The former guard
        # `value not in data.values()` compared a string against dicts, so
        # it was always true, and it rescanned all of data on every
        # iteration, making the loop quadratic.
        entry[getType(value)] = value

        # Only the type and the ID itself are present: no alias was found.
        if len(entry) == 2:
            print(value, entry)
        else:
            count += 1
    except Exception as e:
        # Best effort: report the ID that failed and carry on with the rest.
        print(value, str(e))


print("Total with aliases:", count, "/", len(data))

In [None]:
# Spot check: display the entry for the ID "155060" (KeyError if it is not a key of data).
data["155060"]

### Add data to database

In [None]:
# Insert statement for the alias table. OR IGNORE silently skips a row that
# conflicts with an existing one.
ALIASES_WRITE = '''INSERT OR IGNORE INTO {table} (
                {col1}, {col2}, {col3}, {col4}, {col5} )
                VALUES ( ? , ? , ? , ? , ? );'''.format(
            table = TABLE_NAME, 
            col1=MESH_VALUE,
            col2=PHARMGKB_VALUE,
            col3=NCBI_VALUE,
            col4=VALUES_COLUMN,
            col5=TYPE_COLUMN)

# The empty-string ID carries no information; drop it if present.
data.pop('', None)

for key in log_progress(data, every=1000, name="Added to DB"):
    values = data[key]
    try:
        cursor_all.execute(
            ALIASES_WRITE,
            (
                values.get(MESH_VALUE),      # None (SQL NULL) when unknown
                values.get(PHARMGKB_VALUE),
                values.get(NCBI_VALUE),
                json.dumps(values),          # full entry, stored as JSON text
                values[TYPE_COLUMN],         # same key the entries were created with
            ),
        )
    except Exception:
        # Show which entry failed, then re-raise with the original traceback.
        print(key, values)
        raise

# Report success only when the work was actually done; previously both
# messages were printed even with WRITE set to False.
if WRITE:
    conn_all.commit()
    print("Data committed")

    with open("all_aliases_for_mike.json", "w") as file:
        json.dump(data, file)
    print("file written")
else:
    print("WRITE is False: nothing committed, no file written")