In [1]:
import pandas as pd
import numpy as np

# Load the drug/disease table and drop every row whose "RX/OTC" value is
# exactly 'OTC'.
drug_disease_path = "drugs.com/drug_disease.csv"
df = pd.read_csv(drug_disease_path)
not_otc = df["RX/OTC"] != 'OTC'
df = df[not_otc]


In [2]:
# Keep only rows whose "generic name" cell is not the literal string "[]",
# i.e. rows that list at least one generic drug name.
has_generic_name = df["generic name"] != "[]"
df = df[has_generic_name]
df.head(10)

Unnamed: 0.1,Unnamed: 0,disease name,other names,RX/OTC,drug,generic name,drug class,brand names,pregnacy label,CSA label
0,1,abdominal distension prior to abdominal x ray,[],Rx,vasopressin,['vasopressin'],['antidiuretic-hormones'],"['Vasostrict', 'Pitressin']",C,N
1,2,abdominal distension prior to abdominal x ray,[],Rx,Vasostrict,['vasopressin'],['antidiuretic-hormones'],['Vasostrict'],C,N
2,3,abdominal distension,['Abdominal bloating; Bloating; Meteorism'],Rx,bethanechol,['bethanechol'],['miscellaneous-genitourinary-tract-agents'],"['Duvoid', 'Urecholine']",C,N
3,4,abdominal distension,['Abdominal bloating; Bloating; Meteorism'],Rx,Urecholine,['bethanechol'],['miscellaneous-genitourinary-tract-agents'],"['Duvoid', 'Urecholine']",C,N
4,5,abdominal distension,['Abdominal bloating; Bloating; Meteorism'],Rx,Duvoid,['bethanechol'],['miscellaneous-genitourinary-tract-agents'],"['Duvoid', 'Urecholine']",C,N
7,8,abdominal radiological procedure,['Abdominal Radiological Procedures'],Rx,vasopressin,['vasopressin'],['antidiuretic-hormones'],"['Vasostrict', 'Pitressin']",C,N
8,12,abnormal uterine bleeding,"['Bleeding, dysfunctional uterine; DUB; Dysfun...",Rx,medroxyprogesterone,['medroxyprogesterone'],['contraceptives'],[],X,N
10,14,abnormal uterine bleeding,"['Bleeding, dysfunctional uterine; DUB; Dysfun...",Rx,Provera,['medroxyprogesterone'],['contraceptives'],['Provera'],X,N
11,15,abnormal uterine bleeding,"['Bleeding, dysfunctional uterine; DUB; Dysfun...",Rx,norethindrone,['norethindrone'],['contraceptives'],"['Aygestin', 'Camila', 'Errin', 'Jolivette', '...",X,N
12,16,abnormal uterine bleeding,"['Bleeding, dysfunctional uterine; DUB; Dysfun...",Rx,Depo-Provera,['medroxyprogesterone'],['contraceptives'],[],X,N


### Generic names - RXNorm

In [3]:
# Each "generic name" cell is a string-encoded list (e.g. "['vasopressin']")
# that is expected to hold a single name.
# Parse the cell, pull that name out of the list, and lowercase it.

import ast

def _reveal(string):
    s = ""
    if ast.literal_eval(string):
        s = ast.literal_eval(string)[0]
    return s.lower()

# Lowercased first generic name for every remaining row of df.
generic_names = df["generic name"].map(_reveal)

generic_names.head(10)

0             vasopressin
1             vasopressin
2             bethanechol
3             bethanechol
4             bethanechol
7             vasopressin
8     medroxyprogesterone
10    medroxyprogesterone
11          norethindrone
12    medroxyprogesterone
Name: generic name, dtype: object

In [6]:
# RxNorm ingredient/base table; every column is read as text so the codes
# are kept as strings.
str_rxnorm_df = pd.read_csv(
    "mapping data/RXNORM-ingredient-base.csv",
    encoding="latin",
    dtype="str",
)
str_rxnorm_df.head(10)

Unnamed: 0,base,base_str,ingredient,ingredient_str
0,97,ticlopidine hydrochloride,10594,ticlopidine
1,309,adenosine monophosphate,296,adenosine
2,362,epinephrine hydrochloride,3992,epinephrine
3,388,polymyxin b sulfate,8536,polymyxin b
4,613,aluminum hydroxide gel,612,aluminum hydroxide
5,621,amantadine sulfate,620,amantadine
6,643,amikacin sulfate,641,amikacin
7,1115,trihexyphenidyl hydrochloride,10811,trihexyphenidyl
8,1219,atracurium besylate,1218,atracurium
9,1225,hyoscyamine sulfate,153970,hyoscyamine


In [7]:
# Lookup table: ingredient name -> RxNorm ingredient code.
# When a name occurs more than once, the last row wins.
ingredient_names = str_rxnorm_df["ingredient_str"]
ingredient_codes = str_rxnorm_df["ingredient"]
str_rxn_dict = {name: code for name, code in zip(ingredient_names, ingredient_codes)}

#### Map generic names to rxnorms

In [8]:
# For every generic name, collect its RxNorm code (an empty list when the
# name is not in str_rxn_dict) and count how many names were mapped.
generic_rxn_list = []
mapped_num = 0
i = 0  # ends up equal to the number of names processed
for i, name in enumerate(generic_names, start=1):
    codes = [str_rxn_dict[name]] if name in str_rxn_dict else []
    if codes:
        mapped_num += 1
    generic_rxn_list.append(codes)

#### Mapping rate from drug generic names to RXNorms

In [9]:
# Share of rows whose generic name resolved to an RxNorm code.
# The hits are counted from generic_rxn_list itself rather than read from
# mapped_num: that counter is reset and reused by the later mapping cells, so
# relying on it would report the wrong rate if this cell is re-run after them.
generic_hits = sum(1 for codes in generic_rxn_list if len(codes) > 0)
# Guard against an empty input so the cell shows 0.0 instead of raising.
mapping_rate_str2rxn = generic_hits / len(generic_rxn_list) if generic_rxn_list else 0.0
mapping_rate_str2rxn

0.6835752263256215

In [10]:
# Sanity check: the mapping loop appends exactly one entry per generic name.
len(generic_rxn_list)

13918

In [11]:
# Assemble one row per drug entry: the disease, its generic name(s) joined
# with "|", and the first mapped RxNorm code ("" when nothing was mapped).
str_rxn_df = pd.DataFrame(
    {
        "disease name": df["disease name"].to_list(),
        "generic name": [
            "|".join(ast.literal_eval(raw_names))
            for raw_names in df["generic name"].to_list()
        ],
        "generic rx norms": [
            codes[0] if len(codes) > 0 else "" for codes in generic_rxn_list
        ],
    }
)
str_rxn_df.head(20)


Unnamed: 0,disease name,generic name,generic rx norms
0,abdominal distension prior to abdominal x ray,vasopressin,
1,abdominal distension prior to abdominal x ray,vasopressin,
2,abdominal distension,bethanechol,19257.0
3,abdominal distension,bethanechol,19257.0
4,abdominal distension,bethanechol,19257.0
5,abdominal radiological procedure,vasopressin,
6,abnormal uterine bleeding,medroxyprogesterone,6691.0
7,abnormal uterine bleeding,medroxyprogesterone,6691.0
8,abnormal uterine bleeding,norethindrone,7514.0
9,abnormal uterine bleeding,medroxyprogesterone,6691.0


### Brand names - RXNorm

In [12]:
def _reveal_brandnames(string):
    li = []
    list_string = ast.literal_eval(string)
    if list_string:
        for s in list_string:
            li.append(s.lower())
    return li

# Lowercased brand-name list for every remaining row of df.
brand_name_list = df["brand names"].map(_reveal_brandnames)
brand_name_list.head(20)

0                               [vasostrict, pitressin]
1                                          [vasostrict]
2                                  [duvoid, urecholine]
3                                  [duvoid, urecholine]
4                                  [duvoid, urecholine]
7                               [vasostrict, pitressin]
8                                                    []
10                                            [provera]
11    [aygestin, camila, errin, jolivette, nora-be, ...
12                                                   []
13                                  [megace es, megace]
14    [estarylla, femynor, mono-linyah, mononessa, o...
15                                                   []
16                                                   []
17    [aygestin, camila, errin, jolivette, nora-be, ...
18    [amethyst, aviane, balcoltra, falmina, levlen,...
19                                                   []
20                                              

In [13]:
# Lookup table: base_str -> base code, used below to look up brand names.
# NOTE(review): in the table preview above, base_str holds salt-form names
# (e.g. "ticlopidine hydrochloride") rather than brand names — confirm this
# is the intended source for brand-name matching.
base_names = str_rxnorm_df["base_str"]
base_codes = str_rxnorm_df["base"]
brand_str_rxn_dict = {name: code for name, code in zip(base_names, base_codes)}

In [14]:
# For each row, look up every brand name in brand_str_rxn_dict, echo the names
# that matched, and keep the matching codes (an empty list when none matched).
brand_rxn_list = []
mapped_num = 0
i = 0  # ends up equal to the number of rows processed
for i, brands in enumerate(brand_name_list, start=1):
    matched = [brand for brand in brands if brand in brand_str_rxn_dict]
    for brand in matched:
        print(brand)
    cur_rxns = [brand_str_rxn_dict[brand] for brand in matched]
    if cur_rxns:
        mapped_num += 1
    brand_rxn_list.append(cur_rxns)

vecuronium bromide
testosterone cypionate
testosterone enanthate
testosterone cypionate
testosterone enanthate
colesevelam hydrochloride
timolol maleate
timolol maleate
ganirelix acetate
colesevelam hydrochloride
colesevelam hydrochloride
testosterone cypionate
testosterone enanthate
testosterone cypionate
testosterone enanthate
testosterone cypionate
testosterone enanthate
timolol maleate
timolol maleate
methotrexate sodium
methotrexate sodium
testosterone cypionate
testosterone enanthate


#### Mapping rate from brand names to RXNorms

In [15]:
# Share of rows for which at least one brand name resolved to a code.
# The hits are counted from brand_rxn_list itself rather than read from
# mapped_num: that counter is shared with the other mapping cells, so relying
# on it would report the wrong rate if the cells are run out of order.
brand_hits = sum(1 for codes in brand_rxn_list if len(codes) > 0)
# Guard against an empty input so the cell shows 0.0 instead of raising.
mapping_rate_str2rxn_brand = brand_hits / len(brand_rxn_list) if brand_rxn_list else 0.0
mapping_rate_str2rxn_brand

0.0010777410547492456

In [16]:
# Attach the per-row lists of matched brand codes (possibly empty) to the
# frame as a new "brand rxnorms" column.
str_rxn_df["brand rxnorms"] = brand_rxn_list


In [17]:
# Collapse each list of matched codes to its first entry ("" when nothing
# matched). Values that are no longer lists are left untouched, so re-running
# this cell does not index into an already-collapsed string and truncate the
# code to its first character.
str_rxn_df["brand rxnorms"] = str_rxn_df["brand rxnorms"].apply(
    lambda x: (x[0] if len(x) > 0 else "") if isinstance(x, list) else x
)

In [18]:
# Preview the first 20 rows, now including the brand-code column.
str_rxn_df.head(20)

Unnamed: 0,disease name,generic name,generic rx norms,brand rxnorms
0,abdominal distension prior to abdominal x ray,vasopressin,,
1,abdominal distension prior to abdominal x ray,vasopressin,,
2,abdominal distension,bethanechol,19257.0,
3,abdominal distension,bethanechol,19257.0,
4,abdominal distension,bethanechol,19257.0,
5,abdominal radiological procedure,vasopressin,,
6,abnormal uterine bleeding,medroxyprogesterone,6691.0,
7,abnormal uterine bleeding,medroxyprogesterone,6691.0,
8,abnormal uterine bleeding,norethindrone,7514.0,
9,abnormal uterine bleeding,medroxyprogesterone,6691.0,


## Broad new full dict mapping

In [21]:
# The dictionary file is pipe-delimited and has no header row, so the columns
# are labelled by hand; only "name" and "cui" are given meaningful labels.
broad_dict_columns = ["name", "1", "2", "cui", "4", "5"]
broad_dict_raw = pd.read_csv(
    "mapping data/broad_new_full_dict.txt",
    sep="|",
    header=None,
)
broad_dict_raw.columns = broad_dict_columns

  interactivity=interactivity, compiler=compiler, result=result)


In [22]:
# Lowercase every name; non-string cells (e.g. NaN from an empty field)
# become "".
string = broad_dict_raw["name"].apply(lambda x: x.lower() if isinstance(x, str) else "")
# Build name -> CUI, skipping the "" placeholder so that rows without a name
# do not create a bogus "" key that an empty lookup string would match.
# When a name occurs more than once, the last row wins.
str_cui_dic = {
    name: cui for name, cui in zip(string, broad_dict_raw["cui"]) if name
}

In [23]:
def _unique(lst):
    '''returns a list with unique values'''
    return list(np.unique(np.array(lst)))

In [24]:
# For every generic name, collect its CUI (an empty list when the name is not
# in str_cui_dic), de-duplicate it, and count how many names were mapped.
cui_codes_list = []
mapped_num = 0
i = 0  # ends up equal to the number of names processed
for i, name in enumerate(generic_names, start=1):
    found = [str_cui_dic[name]] if name in str_cui_dic else []
    cur_cuis = _unique(found)
    if len(cur_cuis) != 0:
        mapped_num += 1
    cui_codes_list.append(cur_cuis)

#### Mapping rate of generic names to CUI

In [25]:
# Share of rows whose generic name resolved to a CUI.
# The hits are counted from cui_codes_list itself rather than read from
# mapped_num: that counter is shared with the earlier mapping cells, so relying
# on it would report the wrong rate if the cells are run out of order.
cui_hits = sum(1 for cuis in cui_codes_list if len(cuis) > 0)
# Guard against an empty input so the cell shows 0.0 instead of raising.
mapping_rate_str2cui = cui_hits / len(cui_codes_list) if cui_codes_list else 0.0
mapping_rate_str2cui

0.8308665038080184

### CUI to RXNorms