In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

# Creating an indication Gold Standard in UMLS CUI space

### Relationships

In [2]:
rels = pd.read_csv('../data/drugcentral_rel.csv')
rels.head(2)

Unnamed: 0,id,struct_id,concept_id,relationship_name,concept_name,umls_cui,snomed_full_name,cui_semantic_type,snomed_conceptid
0,132463,1253,21000041,indication,Tuberculosis,C0041296,Tuberculosis,T047,56717001.0
1,132464,5203,21000533,indication,Malignant tumor of ovary,C1140680,Malignant tumor of ovary,T191,363443007.0


In [3]:
rels['relationship_name'].value_counts()

contraindication    27547
indication          10875
off-label use        2442
reduce risk             1
Name: relationship_name, dtype: int64

In [4]:
rels.query('relationship_name == "contraindication"').head(5)

Unnamed: 0,id,struct_id,concept_id,relationship_name,concept_name,umls_cui,snomed_full_name,cui_semantic_type,snomed_conceptid
166,132629,5022,40249237,contraindication,Decreased liver function,C0232744,Decreased liver function,T033,77981007.0
252,132714,4976,21000873,contraindication,Idiopathic thrombocytopenic purpura,C0043117,Idiopathic thrombocytopenic purpura,T047,32273002.0
260,132722,4978,40249212,contraindication,Sepsis,C0243026,Sepsis,T047,91302008.0
377,132838,4881,21001119,contraindication,Narcolepsy,C0027404,Narcolepsy,T047,60380001.0
385,132846,1917,21001709,contraindication,Low blood pressure,C0020649,Low blood pressure,T033,45007003.0


In [5]:
rels.query('relationship_name == "indication"')['struct_id'].nunique()

2317

In [6]:
rels.query('relationship_name == "indication"')['concept_id'].nunique()

2234

### IDs

In [7]:
ids = pd.read_csv('../data/drugcentral_ids.csv')
ids.head(2)

Unnamed: 0,id,identifier,id_type,struct_id,parent_match
0,1056309,DB00001,DRUGBANK_ID,2995,
1,1056310,DB00002,DRUGBANK_ID,4954,


In [8]:
id_cols = ['identifier', 'id_type', 'struct_id'] #only need the information from these ids

### Synonyms

In [9]:
syn = pd.read_csv('../data/drugcentral_syn.csv')
syn.head(2)

Unnamed: 0,syn_id,id,name,preferred_name,parent_id,lname
0,22226,2210.0,piroxicam olamine,,,piroxicam olamine
1,22227,,Blood-coagulation factor VIII,1.0,221.0,blood-coagulation factor viii


In [10]:
# Rename to match other data sources column names
syn.rename(columns={'id': 'struct_id'}, inplace=True)
syn_cols = ['struct_id', 'name'] # only interested in these two columns

## UMLS CUIs

In [11]:
# What percentage (if any) of lines in the relationships table do not have a umls_cui for the disease?
'{0:%}'.format((len(rels) - rels['umls_cui'].count()) / len(rels))

'9.582773%'

May have to fix later... 

Let's look at compounds

In [12]:
# number of total compounds with a relationship
rels['struct_id'].nunique()

2388

In [13]:
# number of total compounds with an id
ids['struct_id'].nunique()

4483

In [14]:
# number of total compounds with a preferred name
syn.query('preferred_name == 1')['struct_id'].nunique()

4486

In [15]:
ids['id_type'].unique()

array(['DRUGBANK_ID', 'IUPHAR_LIGAND_ID', 'KEGG_DRUG',
       'MESH_DESCRIPTOR_UI', 'VUID', 'NUI', 'UMLSCUI',
       'MESH_SUPPLEMENTAL_RECORD_UI', 'ChEMBL_ID', 'UNII',
       'SECONDARY_CAS_RN', 'INN_ID', 'CHEBI', 'PUBCHEM_CID', 'RXNORM',
       'NDDF', 'VANDF', 'MMSL', 'NDFRT', 'SNOMEDCT_US', 'PDB_CHEM_ID'], dtype=object)

In [16]:
ids.query('id_type == "UMLSCUI"')['struct_id'].nunique() / ids['struct_id'].nunique()

0.6330582199420032

Only about 63% of compounds have a UMLS CUI in the database.

In [17]:
# How many structures in the relationships table don't have an associated UMLSCUI?
len(set(rels['struct_id'].unique()) - set(ids.query('id_type == "UMLSCUI"')['struct_id'].unique()))

406

In [18]:
no_cui = list(set(rels['struct_id'].unique()) - set(ids.query('id_type == "UMLSCUI"')['struct_id'].unique()))

In [19]:
syn.query('struct_id in @no_cui and preferred_name == 1').sort_values('struct_id').head()

Unnamed: 0,syn_id,struct_id,name,preferred_name,parent_id,lname
5622,3826,6.0,(S)-nitrendipine,1.0,,(s)-nitrendipine
11520,9884,49.0,acefylline,1.0,,acefylline
12302,10675,55.0,acetarsol,1.0,,acetarsol
4347,2520,72.0,acetylpheneturide,1.0,,acetylpheneturide
4195,2364,77.0,acitazanolast,1.0,,acitazanolast


In [20]:
import requests
from pyquery import PyQuery as pq

In [21]:
def handshake():
    """
    Log in to the UMLS UTS authentication service and return the
    ticket-granting-ticket (TGT) URL.

    The API key is read from the module-level ``TICKET`` variable, which must
    be defined before this function is called.

    Returns
    -------
    str
        The ``action`` attribute of the ``<form>`` element in the login
        response. ``name_to_umls`` posts to this URL to obtain a service
        ticket.

    Raises
    ------
    RuntimeError
        If the login response contains no form action (for example because
        the API key was rejected), instead of silently returning ``None``.
    """
    # UMLS REST authentication instructions:
    # https://documentation.uts.nlm.nih.gov/rest/authentication.html
    r = requests.post("https://utslogin.nlm.nih.gov/cas/v1/api-key", data={'apikey': TICKET})
    tgt = pq(r.text).find('form').attr('action')
    if not tgt:
        # Fail here with a clear message rather than letting a None URL reach
        # requests.post() in name_to_umls, where the error is much harder to read.
        raise RuntimeError(
            'UMLS login did not return a ticket-granting URL '
            '(HTTP status {}); check the API key'.format(r.status_code))
    return tgt

In [22]:
def name_to_umls(name, tgt):
    """
    Given a name, get the UMLS CUI.

    Parameters
    ----------
    name : str
        Concept name to search for.
    tgt : str
        Ticket-granting-ticket URL, as returned by ``handshake()``.

    Returns
    -------
    str or float
        The last path segment of the ``uri`` of the first search result whose
        name equals ``name`` ignoring case, or ``NaN`` when no result matches
        exactly.
    """
    # A service ticket is requested from the TGT URL on every call.
    r = requests.post(tgt, data={'service': 'http://umlsks.nlm.nih.gov'})
    st = r.text

    # Pass the query through `params` so requests percent-encodes it. Formatting
    # the name straight into the URL corrupts the query string for names that
    # contain characters such as '&', '#', '+' or '%'.
    url = "https://uts-ws.nlm.nih.gov/rest/search/current"
    d = requests.get(url, params={'string': name, 'ticket': st}).json()

    wanted = name.lower()
    for res in d['result']['results']:
        if res['name'].lower() == wanted:
            return res['uri'].split('/')[-1]

    return float('NaN')

In [23]:
# Load the UMLS API key from ../data/api.key, stripping trailing whitespace.
# NOTE: despite its name, TICKET holds the API key that handshake() posts as
# 'apikey'; it is not a ticket itself.
with open('../data/api.key', 'r') as fin: 
    TICKET = fin.read().rstrip()

In [24]:
need_cui = syn.query('struct_id in @no_cui and preferred_name == 1').sort_values('struct_id').reset_index(drop=True)

In [25]:
%%time
tgt = handshake()
need_cui['UMLSCUI'] = need_cui['name'].apply(lambda n: name_to_umls(n, tgt))

CPU times: user 11 s, sys: 412 ms, total: 11.4 s
Wall time: 4min 16s


In [26]:
"{:%} of compound were able to obtain a CUI".format(need_cui['UMLSCUI'].count() / len(need_cui))

'85.714286% of compound were able to obtain a CUI'

There may still be some parenthesis issues; let's look at them and see if they can be fixed.

In [27]:
need_cui[need_cui['name'].str.contains('(', regex=False)]

Unnamed: 0,syn_id,struct_id,name,preferred_name,parent_id,lname,UMLSCUI
0,3826,6.0,(S)-nitrendipine,1.0,,(s)-nitrendipine,
265,19413,5041.0,technetium (99mTc) nofetumomab merpentan,1.0,,technetium (99mtc) nofetumomab merpentan,
272,19612,5053.0,amino(diphenylhydantoin) valeric acid,1.0,,amino(diphenylhydantoin) valeric acid,C3652632
325,21193,5124.0,technetium (99mTc) sestamibi,1.0,,technetium (99mtc) sestamibi,
345,21499,5153.0,fluciclovine (18F),1.0,,fluciclovine (18f),
350,21596,5159.0,eptacog alfa (activated),1.0,,eptacog alfa (activated),


### Start to put things together

In [28]:
# Lookup table: struct_id -> UMLS CUI, taken from the identifiers table.
umls_ids = ids[ids['id_type'] == "UMLSCUI"]
cui_map = dict(zip(umls_ids['struct_id'], umls_ids['identifier']))

# Preferred-name synonyms that have a struct_id, with struct_id cast to int64
# and the compound's CUI attached (NaN when the identifiers table has none).
pref = (
    syn[syn['preferred_name'] == 1]
    .reset_index(drop=True)
    .dropna(subset=['struct_id'])
    .assign(struct_id=lambda frame: frame['struct_id'].astype('int64'))
    .assign(UMLSCUI=lambda frame: frame['struct_id'].apply(
        lambda sid: cui_map.get(sid, float('NaN'))))
)

In [29]:
pref = pref.dropna(subset=['UMLSCUI'])

In [30]:
new_syn = pd.concat([pref, need_cui.dropna(subset=['UMLSCUI']).reset_index(drop=True)]).reset_index(drop=True)

In [31]:
still_need_cui = need_cui[need_cui['UMLSCUI'].isnull()].reset_index(drop=True)

In [32]:
%%time
# NOTE(review): still_need_cui holds exactly the names for which name_to_umls
# already returned NaN, and this cell repeats the same lookup on the same,
# unmodified names. It therefore finds nothing new (0% in the next cell) unless
# the names are normalised first, e.g. by handling the parenthesised parts.
tgt = handshake()
still_need_cui['UMLSCUI'] = still_need_cui['name'].apply(lambda n: name_to_umls(n, tgt))

CPU times: user 1.57 s, sys: 76 ms, total: 1.65 s
Wall time: 35.5 s


In [33]:
'{:%} of remaining were to find CUI'.format(still_need_cui['UMLSCUI'].count() / len(still_need_cui))

'0.000000% of remaining were to find CUI'

In [34]:
new_syn = pd.concat([new_syn, still_need_cui.dropna(subset=['UMLSCUI']).reset_index(drop=True)]).reset_index(drop=True)

In [35]:
syn_cols = ['struct_id', 'name', 'UMLSCUI']

In [36]:
# Attach each compound's preferred name and UMLS CUI to its relationships.
# how='left' keeps every row of rels; compounds with no entry in new_syn get NaN.
# NOTE(review): if new_syn holds more than one row for a struct_id, this merge
# repeats the matching relationship rows — confirm struct_id is unique in new_syn.
rel_with_cui = pd.merge(rels, new_syn[syn_cols], how='left', on='struct_id')

In [37]:
"{:%} of relationships now have a UMLSCUI for the compound".format(rel_with_cui['UMLSCUI'].count() / rel_with_cui.shape[0])

'99.341735% of relationships now have a UMLSCUI for the compound'

### Add in approval Dates

In [38]:
app = pd.read_csv('../data/drugcentral_approvals.csv')
app.head()

Unnamed: 0,id,struct_id,approval,type,applicant,orphan
0,3578,5204,2009-06-30,FDA,AMAG PHARMS INC,
1,3579,5200,,FDA,,
2,3580,5198,,FDA,,
3,3535,5201,2016-12-14,FDA,ANACOR PHARMS INC,
4,3536,5203,2016-12-19,FDA,"Clovis Oncology, Inc.",


In [39]:
# Keep one row per compound: the record with the earliest approval date.
# GroupBy.first() is deliberately avoided here: it returns the first non-null
# value of each column independently, so fields such as applicant or orphan
# could come from a later approval record than the reported approval date.
app = (app.dropna(subset=['approval'])                     # Remove rows with no approval date
          .sort_values('approval')                         # Put the earliest approval first
          .drop_duplicates(subset='struct_id', keep='first')  # Keep that whole row for each compound
          .sort_values('struct_id')                        # Order by compound id
          .reset_index(drop=True))                         # Clean 0..n-1 index, struct_id stays a column

In [40]:
rel_with_cui = pd.merge(rel_with_cui, app[['struct_id', 'approval']], how='left', on='struct_id')
rel_with_cui.head(2)

Unnamed: 0,id,struct_id,concept_id,relationship_name,concept_name,umls_cui,snomed_full_name,cui_semantic_type,snomed_conceptid,name,UMLSCUI,approval
0,132463,1253,21000041,indication,Tuberculosis,C0041296,Tuberculosis,T047,56717001.0,,,
1,132464,5203,21000533,indication,Malignant tumor of ovary,C1140680,Malignant tumor of ovary,T191,363443007.0,rucaparib,C3661315,2016-12-19


In [41]:
rel_columns = ['struct_id', 'UMLSCUI', 'name', 'relationship_name', 'concept_id', 'umls_cui', 'concept_name', 'cui_semantic_type', 'approval']
rel_rename = {'struct_id': 'dc_struct_id',
              'UMLSCUI': 'compound_umlscui', 
              'name': 'compound_name', 
              'relationship_name': 'relationship', 
              'concept_id': 'dc_disease_id', 
              'umls_cui': 'disease_umlscui', 
              'concept_name': 'disease_name', 
              'cui_semantic_type': 'disease_umls_semantic_type',
              'approval': 'date_approved'}

rel_with_cui = rel_with_cui[rel_columns].rename(columns = rel_rename)

In [42]:
rel_with_cui.head(2)

Unnamed: 0,dc_struct_id,compound_umlscui,compound_name,relationship,dc_disease_id,disease_umlscui,disease_name,disease_umls_semantic_type,date_approved
0,1253,,,indication,21000041,C0041296,Tuberculosis,T047,
1,5203,C3661315,rucaparib,indication,21000533,C1140680,Malignant tumor of ovary,T191,2016-12-19


In [43]:
rel_with_cui = rel_with_cui.dropna(subset=['compound_umlscui', 'disease_umlscui'])
rel_with_cui.to_csv('../data/umls_formatted_drugcentral_relationships.csv', index=False)

In [44]:
indications = rel_with_cui.query('relationship == "indication"').reset_index(drop=True)

In [45]:
print('Indications: ', len(indications))
print('Number of compounds: ', indications['compound_umlscui'].nunique())
print('Number of diseases: ', indications['disease_umlscui'].nunique())
print('Indications with dates: ', indications['date_approved'].count())

Indications:  8270
Number of compounds:  2170
Number of diseases:  1308
Indications with dates:  7074


In [46]:
indications.to_csv('../data/indications.csv', index=False)