In [34]:
import pandas as pd
import numpy as np

df = pd.read_csv("output/drug_disease.csv").replace(np.nan, '', regex=True)

In [35]:
df.columns

Index(['disease name', 'other names', 'RX/OTC', 'drug', 'generic name',
       'drug class', 'brand names', 'pregnancy label', 'CSA label'],
      dtype='object')

In [36]:
df_rx = df[df["RX/OTC"] != 'otc']

In [37]:
df_rx[:10]

Unnamed: 0,disease name,other names,RX/OTC,drug,generic name,drug class,brand names,pregnancy label,CSA label
0,abdominal distension prior to abdominal x ray,,rx,vasopressin,vasopressin,antidiuretic-hormones,vasostrict|pitressin,c,n
1,abdominal distension prior to abdominal x ray,,rx,vasostrict,vasopressin,antidiuretic-hormones,vasostrict,c,n
2,abdominal distension,abdominal bloating|bloating|meteorism,rx,bethanechol,bethanechol,miscellaneous-genitourinary-tract-agents,duvoid|urecholine,c,n
3,abdominal distension,abdominal bloating|bloating|meteorism,rx,urecholine,bethanechol,miscellaneous-genitourinary-tract-agents,duvoid|urecholine,c,n
4,abdominal distension,abdominal bloating|bloating|meteorism,rx,duvoid,bethanechol,miscellaneous-genitourinary-tract-agents,duvoid|urecholine,c,n
7,abdominal radiological procedure,abdominal radiological procedures,rx,vasopressin,vasopressin,antidiuretic-hormones,vasostrict|pitressin,c,n
8,abnormal uterine bleeding,bleedingdysfunctional uterine|dub|dysfunctiona...,rx,medroxyprogesterone,medroxyprogesterone,contraceptives,,x,n
9,abnormal uterine bleeding,bleedingdysfunctional uterine|dub|dysfunctiona...,rx,mirena,,contraceptives,,x,n
10,abnormal uterine bleeding,bleedingdysfunctional uterine|dub|dysfunctiona...,rx,provera,medroxyprogesterone,contraceptives,provera,x,n
11,abnormal uterine bleeding,bleedingdysfunctional uterine|dub|dysfunctiona...,rx,norethindrone,norethindrone,contraceptives,aygestin|camila|errin|jolivette|nora-be|ortho ...,x,n


### Generic names - RXNorm

In [38]:
# The generic name column contains lists that only have one string.
# Pull the string out of the list.
# lowercase
#
# import ast
#
# def _reveal(string):
#     s = ""
#     if ast.literal_eval(string):
#         s = ast.literal_eval(string)[0]
#     return s.lower()
#

# generic_names = df["generic name"].map(lambda x: _reveal(x))
#
# generic_names[:10]

# removed because disease-drug is cleaned already.

In [39]:
str_rxnorm_df = pd.read_csv("../mapping data/RXNORM-ingredient-base.csv", encoding = "latin", dtype="str")
str_rxnorm_df[:10]

Unnamed: 0,base,base_str,ingredient,ingredient_str
0,97,ticlopidine hydrochloride,10594,ticlopidine
1,309,adenosine monophosphate,296,adenosine
2,362,epinephrine hydrochloride,3992,epinephrine
3,388,polymyxin b sulfate,8536,polymyxin b
4,613,aluminum hydroxide gel,612,aluminum hydroxide
5,621,amantadine sulfate,620,amantadine
6,643,amikacin sulfate,641,amikacin
7,1115,trihexyphenidyl hydrochloride,10811,trihexyphenidyl
8,1219,atracurium besylate,1218,atracurium
9,1225,hyoscyamine sulfate,153970,hyoscyamine


In [40]:
# build a dictionary mapping from strings to rxnormsq
str_rxn_dict = dict(zip(str_rxnorm_df["ingredient_str"], str_rxnorm_df["ingredient"]))

#### Map generic names to rxnorms

In [41]:
def _unique(lst):
    '''returns a list with unique values'''
    return list(np.unique(np.array(lst)))

In [42]:
generic_names = df_rx["generic name"].to_list()
len(generic_names)

14624

In [43]:
generic_names_unique = generic_names
generic_names_unique = _unique(generic_names)
generic_names_unique.remove("")
len(generic_names_unique)

1996

In [57]:
mapped_num = 0
# map our data
generic_rxn_list = []
i = 0

prev_g_names = []
for generic_name in generic_names:
    rxn = []
    if generic_name in brand_str_rxn_dict:
        rxn.append(brand_str_rxn_dict[generic_name])

    # If it's a generic that has appeared, and the mapping is not empty
    if generic_name not in prev_g_names and rxn!=[]:
        mapped_num += 1
        prev_g_names.append(generic_name)
        
    generic_rxn_list.append("|".join(rxn))
    i += 1

#### Mapping rate from drug generic names to RXNorms


In [58]:
mapping_rate_str2rxn = mapped_num/len(generic_names_unique)
mapping_rate_str2rxn

0.037575150300601205

In [46]:
output_df = pd.read_csv("output/PheCode_rxNorm.csv").replace(np.nan, '', regex=True)
output_df = output_df.drop("Unnamed: 0", axis=1)
output_df[:10]

Unnamed: 0,disease name,other names,RX/OTC,drug,generic name,drug class,brand names,pregnancy label,CSA label,cui_from_generic,cui_from_brand
0,abdominal distension prior to abdominal x ray,,rx,vasopressin,vasopressin,antidiuretic-hormones,vasostrict|pitressin,c,n,C0003779|C0201849,
1,abdominal distension prior to abdominal x ray,,rx,vasostrict,vasopressin,antidiuretic-hormones,vasostrict,c,n,C0003779|C0201849,
2,abdominal distension,abdominal bloating|bloating|meteorism,rx,bethanechol,bethanechol,miscellaneous-genitourinary-tract-agents,duvoid|urecholine,c,n,C0053526,
3,abdominal distension,abdominal bloating|bloating|meteorism,rx,urecholine,bethanechol,miscellaneous-genitourinary-tract-agents,duvoid|urecholine,c,n,C0053526,
4,abdominal distension,abdominal bloating|bloating|meteorism,rx,duvoid,bethanechol,miscellaneous-genitourinary-tract-agents,duvoid|urecholine,c,n,C0053526,
5,abdominal radiological procedure,abdominal radiological procedures,rx,vasopressin,vasopressin,antidiuretic-hormones,vasostrict|pitressin,c,n,C0003779|C0201849,
6,abnormal uterine bleeding,bleedingdysfunctional uterine|dub|dysfunctiona...,rx,medroxyprogesterone,medroxyprogesterone,contraceptives,,x,n,C0025147,C1272460|C1705112
7,abnormal uterine bleeding,bleedingdysfunctional uterine|dub|dysfunctiona...,rx,mirena,,contraceptives,,x,n,C1272460|C1705112,C1272460|C1705112
8,abnormal uterine bleeding,bleedingdysfunctional uterine|dub|dysfunctiona...,rx,provera,medroxyprogesterone,contraceptives,provera,x,n,C0025147,C0699702
9,abnormal uterine bleeding,bleedingdysfunctional uterine|dub|dysfunctiona...,rx,norethindrone,norethindrone,contraceptives,aygestin|camila|errin|jolivette|nora-be|ortho ...,x,n,C0028356,


In [58]:
output_df["exact_rxNorm_generic"] = generic_rxn_list
output_df.to_csv("output/PheCode_rxNorm.csv", index=False)

In [59]:
# # build a frame
# # store the disease-cui-phe  data
# str_rxn_df = pd.DataFrame()
#
# str_rxn_df["disease name"] = df["disease name"].to_list()
#
# str_rxn_df["generic name"] = df["generic name"].to_list()
# str_rxn_df["generic name"] = str_rxn_df["generic name"].apply(lambda x: "|".join(ast.literal_eval(x)))
#
# str_rxn_df["generic rx norms"] = generic_rxn_list
# str_rxn_df["generic rx norms"] = str_rxn_df["generic rx norms"].apply(lambda x: x[0] if len(x)>0 else "")
# str_rxn_df[:20]


### Brand names - RXNorm

In [47]:
# import ast
# def _reveal_brandnames(string):
#     li = []
#     list_string = ast.literal_eval(string)
#     if list_string:
#         for s in list_string:
#             li.append(s.lower())
#     return li
#
brand_names = df_rx["brand names"]
brand_names[:20]

0                                  vasostrict|pitressin
1                                            vasostrict
2                                     duvoid|urecholine
3                                     duvoid|urecholine
4                                     duvoid|urecholine
7                                  vasostrict|pitressin
8                                                      
9                                                      
10                                              provera
11    aygestin|camila|errin|jolivette|nora-be|ortho ...
12                                                     
13                                     megace es|megace
14    estarylla|femynor|mono-linyah|mononessa|ortho ...
15                                                     
16                                                     
17    aygestin|camila|errin|jolivette|nora-be|ortho ...
18    amethyst|aviane|balcoltra|falmina|levlen|lillo...
19                                              

In [48]:
brand_names_unique = brand_names
brand_names_unique = _unique(brand_names)
brand_names_unique.remove("")
len(brand_names_unique)

1994

In [49]:
str_rxnorm_df["base_str"]

0                           ticlopidine hydrochloride
1                             adenosine monophosphate
2                           epinephrine hydrochloride
3                                 polymyxin b sulfate
4                              aluminum hydroxide gel
                             ...                     
94440             cytarabine / daunorubicin injection
94441              amphotericin b injectable solution
94442                       amikacin inhalant product
94443    cytarabine / daunorubicin injectable product
94444    cytarabine / daunorubicin injectable product
Name: base_str, Length: 94445, dtype: object

In [50]:
# create a dictionary mapping from brand names to rx norms
brand_str_rxn_dict = dict(zip(str_rxnorm_df["base_str"], str_rxnorm_df["base"]))

In [55]:
# map them
mapped_num = 0
# map our data
brand_rxn = []
unique_rxn = []

previous_brand_names = []
i = 0
for brand_name in brand_names:
    cur_rxns = []
    for name in brand_name.split("|"):
        if name in brand_str_rxn_dict:
            cur_rxns.append(brand_str_rxn_dict[name])

    if brand_names not in previous_brand_names:
        if len(cur_rxns) > 0:
            mapped_num += 1
            unique_rxn.append(brand_str_rxn_dict[name])
        previous_brand_names.append(brand_names)
    
    brand_rxn.append(cur_rxns)
    i += 1

#### Mapping rate from brand names to RXNorms

In [56]:
mapping_rate_str2rxn_brand = mapped_num/len(brand_names_unique)
mapping_rate_str2rxn_brand

0.0

In [17]:
unique_rxn

[]