In [1]:
import pandas as pd
import pickle
import ast
raw_df = pd.read_csv("all_drugs_ade_indications_updated.csv")
all_compos = None
with open('dailymed_all_drugs.pickle',"rb") as f:
    all_compos = pickle.load(f)

In [16]:
def remove_dupes(list_dict):
    if type(list_dict) != list:
        return []
    code_ids = set()
    new_list = []
    for el_dict in list_dict:
        if el_dict['code'] not in code_ids:
            new_list.append(el_dict)
            code_ids.add(el_dict['code'])
    return new_list

def clean_column(col):
    return [ remove_dupes(ast.literal_eval(str(el))) if str(el) != 'nan' else None for el in col.values]

def clean_columns(old_df,cols):
    df = old_df.dropna(subset=['drug','form','roa']).copy()
    for col in cols:
        df[col] = clean_column(df[col])
    return df

In [17]:
xml_lookup_compos = {xml_file : compos for xml_file, compos in all_compos}

In [18]:
%pdb off
df = clean_columns(raw_df,['active_ingredient', 'form', 'roa'])

Automatic pdb calling has been turned OFF


In [19]:
df['compos'] = [xml_lookup_compos[fn] for fn in df['filename']]

In [20]:
df['compos'].values

array([list([('ETHINYL ESTRADIOL', '423D2T571U ||| 2.16.840.1.113883.4.9', '0.025 mg'), ('NORETHINDRONE', 'T18F433X4S ||| 2.16.840.1.113883.4.9', '0.8 mg')]),
       list([('ENTECAVIR', '5968Y6H45M ||| 2.16.840.1.113883.4.9', '0.5 mg')]),
       list([('POTASSIUM BICARBONATE', 'HM5Z15LEBN ||| 2.16.840.1.113883.4.9', '25 meq')]),
       ...,
       list([('Amphetamine Sulfate', '6DPV8NK46S ||| 2.16.840.1.113883.4.9', '5 mg')]),
       list([('SILDENAFIL CITRATE', 'BW9B0ZE037 ||| 2.16.840.1.113883.4.9', '20 mg')]),
       list([('NORETHINDRONE', 'T18F433X4S ||| 2.16.840.1.113883.4.9', '1 mg'), ('ETHINYL ESTRADIOL', '423D2T571U ||| 2.16.840.1.113883.4.9', '0.035 mg')])],
      dtype=object)

In [21]:
df.iloc[0]

Unnamed: 0                                                             0
author                                        Mylan Pharmaceuticals Inc.
drug                   NORETHINDRONE AND ETHINYL ESTRADIOL AND FERROU...
active_ingredient      [{'name': 'ETHINYL ESTRADIOL', 'code': '423D2T...
code                                                           0378-7308
filename               prescription/20180315_c89388f0-ffe4-4058-a644-...
form                                 [{'name': 'KIT', 'code': 'C47916'}]
roa                                 [{'name': 'ORAL', 'code': 'C38288'}]
adverse_events_list    ['Serious cardiovascular events and smoking \n...
adverse_events         6 ADVERSE REACTIONS\nThe most common adverse r...
indications            1 INDICATIONS AND USAGE\nNorethindrone and eth...
active_ingredients                                                   NaN
numerator_name                                                       NaN
numerator_val                                      

In [22]:
df.iloc[0]['roa']

[{'name': 'ORAL', 'code': 'C38288'}]

In [23]:
def construct_drug_name(row):
    form = row['form'][0]['name']
    roa = row['roa'][0]['name']
    start = ' | '.join([f'{drug} {strength}' for drug, _, strength in row['compos']])
    drug_name = start
    if form is not None:
        drug_name += f" WITH FORM OF {form}"
    if roa is not None:
        drug_name += f" WITH ROA OF {roa}"
    return drug_name.upper()

In [56]:
def export_vocab(df,column,term):
    mylist = []
    for val in df[column]:
        if type(val) == list:
            for el in val:
                 mylist.append((el['code'],el['name']))
    new_df = pd.DataFrame(mylist)
  
    new_df.columns = ['dailymed_code','dailymed_label']
    new_df = new_df.sort_values('dailymed_label')
    filename = f"export/dailymed_{term}_en.csv"
    new_df.to_csv(filename,index=False)

In [57]:
def sc(somelist,key=lambda el: el):
    return sorted(list(set(somelist)),key=key)

def make_df_from_sc(sorted_list):
    df = pd.DataFrame()
    df['dailymed_code'] = [i for i in range(len(sorted_list))]
    df['dailymed_label'] = [el.upper() for el in sorted_list]
    return df
def construct_vocab_for_compos(compos):
    unique_compos = {el for _, val in compos for el in val}
    unique_compos_str = sc({f"{el[0]} {el[2]}" for el in unique_compos})
    unique_strengths = sc({el[2] for el in unique_compos})
    unique_active_ingredients = sc({el[0] for el in unique_compos})
    return unique_compos_str, unique_strengths, unique_active_ingredients

names = ['component','strength','active_ingredient']
corresponding_dfs = [make_df_from_sc(el) for el in construct_vocab_for_compos(all_compos)]


In [58]:
for name, corresponding_df in zip(names,corresponding_dfs):
    corresponding_df.to_csv(f'export/dailymed_{name}_en.csv',index=False)

In [59]:
corresponding_dfs[0]

Unnamed: 0,dailymed_code,dailymed_label
0,0,".ALPHA.-TOCOPHEROL SUCCINATE, D- 30 [IU]"
1,1,.ALPHA.-TOCOPHEROL 10 [IU]
2,2,.ALPHA.-TOCOPHEROL 15 [IU]
3,3,.ALPHA.-TOCOPHEROL 20 [IU]
4,4,.ALPHA.-TOCOPHEROL 30 [IU]
...,...,...
10004,10004,ZOLMITRIPTAN 2.5 MG
10005,10005,ZOLPIDEM TARTRATE 1.75 MG
10006,10006,ZOLPIDEM TARTRATE 12.5 MG
10007,10007,ZOLPIDEM TARTRATE 5 MG


In [60]:
corresponding_dfs[1]

Unnamed: 0,dailymed_code,dailymed_label
0,0,.00375 1 / 100 G
1,1,.005 MG / 1 G
2,2,.006 MG
3,3,.0065 MG / 5 ML
4,4,.0091 MG / 1 ML
...,...,...
2747,2747,996 ML / 1 L
2748,2748,997 ML / 1 L
2749,2749,998 ML / 1 L
2750,2750,999 L / 1000 L


In [61]:
df.columns

Index(['Unnamed: 0', 'author', 'drug', 'active_ingredient', 'code', 'filename',
       'form', 'roa', 'adverse_events_list', 'adverse_events', 'indications',
       'active_ingredients', 'numerator_name', 'numerator_val',
       'denominator_name', 'denominator_val', 'compos', 'drug_name',
       'drug_name_with_code'],
      dtype='object')

In [62]:
df['drug_name'] = [construct_drug_name(row) for _, row in df.iterrows()]

In [63]:
df['drug_name_with_code'] = [[{'name' : row['drug_name'], 'code' : row['code']}] for _, row in df.iterrows()]

In [64]:
pairings = [
    ('drug_name_with_code', 'drug'),
    ('form', 'form'),
    ('roa','roa'),
]

In [65]:
%pdb on
for col_name, term in pairings:
    export_vocab(df,col_name,term)

Automatic pdb calling has been turned ON


In [49]:
!code .