In [1]:
!mkdir data

In [1]:
import os
queries = ['IN', 'PIN', 'DF','SCD','SCDC']
for query in queries:
    os.system(f'wget "http://localhost:8080/SnomedDLQuery?message={query}" -O "data/{query}.json"')

In [2]:
import json
import os
import ast
import pandas as pd

def load_wget_results():
    def helper(filename):
        fullpath = f"data/{filename}"
        with open(fullpath,"r") as f:
            content = f.read()[1:-1]
#             import ipdb; ipdb.set_trace()
            obj = ast.literal_eval(content)
            return pd.DataFrame(obj)
    return {filename: helper(filename) for filename in os.listdir("data") if filename.endswith(".json")}
   

In [3]:
dfs = load_wget_results()

In [4]:
dfs['ALL_IN.json'] = pd.concat([dfs['IN.json'],dfs['PIN.json']])

In [5]:
for df in dfs.values():
    df.columns = ['iri', 'prefLabel']

In [6]:
dfs['DF.json']

Unnamed: 0,iri,prefLabel
0,RxNorm#316982,Topical Cream
1,RxNorm#316983,Topical Lotion
2,RxNorm#2284289,Inhalation Spray
3,RxNorm#316984,Topical Oil
4,RxNorm#317678,Enema
...,...,...
118,RxNorm#10652,Toothpaste
119,RxNorm#858080,Buccal Film
120,RxNorm#11107,Vaginal Foam
121,RxNorm#7670,Ophthalmic Solution


In [7]:
def clean_df(df):
    new_df = df[['iri','prefLabel']].copy()
    new_df['prefLabel'] = new_df['prefLabel'].str.upper()
    new_df.columns = ['rxnorm_code','rxnorm_label']
    return new_df.dropna().drop_duplicates().sort_values('rxnorm_label')

# def clean_form_snomed(raw_form_df):
#     mapping = {
#         "EYE" : "OPHTALMIC",
#         "EAR" : "OTIC",
#         "NOSE" : "NASAL"
#     }
#     def sub_form_snomed_helper(form):
#         return ' '.join([mapping.get(el,el) for el in form.split(" ")])
#     form_df = raw_form_df.copy()
#     form_df['old_snomed_label'] = form_df['snomed_label'].copy()
#     form_df['snomed_label'] = [sub_form_snomed_helper(el.replace(" DOSE FORM","")) for el in form_df['old_snomed_label'].values]
#     return form_df

def clean_form_ocrx(raw_form_df):
    def clean_form_ocrx_helper(el):
        # removes the word in and inverts it
        if " IN " in el:
            splitted = el.split(" IN ")
            if len(splitted) > 2:
                import ipdb; ipdb.st_trace()
            start, last = splitted[:2]
            return last + " " + start
        else:
            return el
            
    form_df = raw_form_df.copy()
#     import ipdb; ipdb.set_trace()
    form_df['old_ocrx_label'] = form_df['ocrx_label'].copy()
    form_df['ocrx_label'] = [clean_form_ocrx_helper(el) for el in form_df['old_ocrx_label'].values]
    return form_df

In [8]:
clean_dfs = {key: clean_df(df) for key, df in dfs.items()}

In [9]:
clean_dfs['ALL_IN.json']

Unnamed: 0,rxnorm_code,rxnorm_label
13068,RxNorm#1801150,(((1-METHYL-2-(5-METHYL-3-OXAZOLIDINYL)ETHOXY)...
11064,RxNorm#1599828,"((2S,3S,5S)-2-(2-(2,6-DIMETHYLPHENOXY)ACETAMID..."
13682,RxNorm#2599921,"((4-HYDROXYBUTYL)AZANEDIYL)BIS(HEXANE-6,1-DIYL..."
8894,RxNorm#2557381,(-)-AMBROXIDE
2827,RxNorm#2262004,"(1,1-DIMETHYLETHYL)UREA"
...,...,...
4967,RxNorm#1649134,ZUCCHINI EXTRACT
5902,RxNorm#114176,ZUCLOPENTHIXOL
347,RxNorm#58338,ZUCLOPENTHIXOL ACETATE
349,RxNorm#58339,ZUCLOPENTHIXOL DECANOATE


In [10]:
clean_dfs['DF.json']['rxnorm_label'].values

array(['AUTO-INJECTOR', 'BUCCAL FILM', 'BUCCAL TABLET', 'CARTRIDGE',
       'CHEWABLE EXTENDED RELEASE ORAL TABLET', 'CHEWABLE TABLET',
       'CHEWING GUM', 'DELAYED RELEASE ORAL CAPSULE',
       'DELAYED RELEASE ORAL GRANULES', 'DELAYED RELEASE ORAL TABLET',
       'DISINTEGRATING ORAL TABLET', 'DOUCHE', 'DRUG IMPLANT',
       'DRUG-ELUTING CONTACT LENS', 'DRY POWDER INHALER',
       'EFFERVESCENT ORAL TABLET', 'ENEMA',
       'EXTENDED RELEASE ORAL CAPSULE', 'EXTENDED RELEASE ORAL TABLET',
       'EXTENDED RELEASE SUSPENSION', 'GAS FOR INHALATION',
       'GRANULES FOR ORAL SOLUTION', 'GRANULES FOR ORAL SUSPENSION',
       'INHALATION POWDER', 'INHALATION SOLUTION', 'INHALATION SPRAY',
       'INHALATION SUSPENSION', 'INJECTABLE FOAM', 'INJECTABLE SOLUTION',
       'INJECTABLE SUSPENSION', 'INJECTION', 'INTRAPERITONEAL SOLUTION',
       'INTRATRACHEAL SUSPENSION', 'INTRAUTERINE SYSTEM',
       'IRRIGATION SOLUTION', 'JET INJECTOR', 'MEDICATED BAR SOAP',
       'MEDICATED LIQUID SOAP

In [11]:
#For active ingredients:

In [15]:
def gather_ocrx_medical_term(medical_term):
    df = pd.read_csv(f"/home/james/Datasets/ocrx-owls/match-ocrx/match_{medical_term}_en.csv")
    df = df.drop_duplicates().sort_values("sLabel")
    labels = df['sLabel'].str.upper().values
    codes = df['s'].values
    return [(str(val),str(val_code)) for val, val_code in zip(labels,codes)]

def parse_dosage(dose):
    splitted = dose.split(" ")
    unit = splitted[-1]
    if unit in units_conversion:
        pass
# print('\n'.join([str(el) for el in df_compo['dose'].unique()]))
ocrx_terms = ['form','roa','substance']
ocrx_dfs = [gather_ocrx_medical_term(term) for term in ocrx_terms]

def convert_ocrx_to_df(ocrx_elems):
    return [pd.DataFrame(ocrx_df,columns=['ocrx_label','ocrx_code']) for ocrx_df in ocrx_elems]

ocrx_dfs = convert_ocrx_to_df(ocrx_dfs)
ocrx_dfs[0] = clean_form_ocrx(ocrx_dfs[0])
ocrx_form_df, ocrx_roa_df, ocrx_substance_df = ocrx_dfs

from pytrie import Trie
def make_trie(df):
    code_col, label_col = df.columns
    return {row[label_col]: row[code_col] for _, row in df.iterrows()}

# def make_trie_ansm(ansm_def_df):
#     return {row['snomed_label']: row['snomed_code'] for _, row in ansm_def_df.iterrows()}

def get_prefix(trie,term):
    words = term.split(' ')
    for i in reversed(range(1,len(words)+1)):
        term = ' '.join(words[:i])
        if term in trie:
            return (term, trie[term])
    return None


# snomed_dfs = [clean_form_snomed(clean_dfs['form.json']),clean_dfs['roa.json'],clean_dfs['substances.json']]
# ocrx_form_trie,ocrx_roa_trie,ocrx_substance_trie = [make_trie(el) for el in ocrx_dfs]
# snomed_tries = [make_trie_ansm(el) for el in snomed_dfs]


In [16]:
import os
DIR = 'match-ocrx-full'
def print_stats(df_merged,type_fn,name='rxnorm'):
    # total_match
    prefix_code = f"{name}_code"
    print("TOTAL MATCH:")
    total_df = df_merged.dropna()
    print(total_df.head())
    print(total_df.shape)
    total_df.to_csv(f"{DIR}/match_{type_fn}_total_match.csv")
    # left missing
    print("ONLY OCRX")
    only_ocrx = df_merged.loc[df_merged[prefix_code].isna()]
    print(only_ocrx.head())
    print(only_ocrx.shape)
    only_ocrx.to_csv(f"{DIR}/match_{type_fn}_only_ocrx.csv")
    # right missing
    print(f"ONLY {name.upper()}")
    only_ansm = df_merged.loc[df_merged['ocrx_code'].isna()]
    print(only_ansm.head())
    print(only_ansm.shape)
    stats = {'total' : total_df.shape[0], 'ocrx' : only_ocrx.shape[0], name : only_ansm.shape[0]}
    only_ansm.to_csv(f"{DIR}/match_{type_fn}_only_{name}.csv")
    return stats

def perform_alignment(df1,df2,fn):
    all_stats = dict()
    trie1 = make_trie(df1)
    trie2 = make_trie(df2)
    fullpath = f"{DIR}/match_{fn}.csv"
    df_merged = pd.merge(df1,df2,how="outer",left_on=['ocrx_label'], right_on=['rxnorm_label'])
#     import ipdb; ipdb.set_trace()
    df_merged_with_prefix = add_prefixes(df_merged,trie1,trie2)
    all_stats[fn] = print_stats(df_merged_with_prefix,fn)
    df_merged_with_prefix.to_csv(fullpath)
    return all_stats

def add_prefixes(df,trie,trie_ansm,prefix='rxnorm'):
    prefix_code = f"{prefix}_code"
    prefix_label = f"{prefix}_label"
    def process_row(row,trie):
        if type(row['ocrx_code']) == str and type(row[prefix_code]) == str:
            return (row['ocrx_code'], row['ocrx_label'], 'EXACT')
        elif str(row['ocrx_code']) == 'nan':
            result = get_prefix(trie,row[prefix_label])
            if result is None:
                return (None, None, 'NONE')
            else:
                prefix_ocrx, ocrx_code = result
                return (ocrx_code, prefix_ocrx, 'PREFIX')
        else:
            return (row['ocrx_code'], row['ocrx_label'], 'NA')
    def process_row_ansm(row,trie):
        if type(row['ocrx_code']) == str and type(row[prefix_code]) == str:
            return (row[prefix_code], row[prefix_label], 'EXACT')
        elif str(row[prefix_code]) == 'nan':
            result = get_prefix(trie,row['ocrx_label'])
            if result is None:
                return (None, None, 'NONE')
            else:
                prefix_ansm, ansm_code = result
                return (ansm_code, prefix_ansm, 'PREFIX')
        else:
            return (row[prefix_code], row[prefix_label], 'NA')
        
            
#     import ipdb; ipdb.set_trace()
    els = ['ocrx','snomed']
    new_df = df.copy()
    for el, trie_el, process_fn in zip(els,[trie,trie_ansm],[process_row,process_row_ansm]):
        results = [process_fn(row,trie_el) for _, row in df.iterrows()]
        codes = [result[0] for result in results]
        full_terms = [result[1] for result in results]
        match_types = [result[2] for result in results]
        new_df[f'{el}_code'] = codes
        new_df[f'{el}_label'] = full_terms
        new_df[f'{el}_match_type'] = match_types
    return new_df

# filenames = ['form','roa','substance']
# if not os.path.exists(DIR):
#     os.mkdir(DIR)
# ocrxs = [ocrx_form_df,ocrx_roa_df,ocrx_substance_df]
# # ansms = [bdpm_form_df,bdpm_roa_df,bdpm_substance_df]
# others = snomed_dfs
# others_tries = snomed_tries
# tries = [ocrx_form_trie,ocrx_roa_trie,ocrx_substance_trie]
# all_stats = dict()
# for ocrx, ansm, filename, trie, trie_ansm in zip(ocrxs,others,filenames,tries,others_tries):
#     fullpath = f"{DIR}/match_{filename}.csv"
#     df_merged = pd.merge(ocrx,ansm,how="outer",left_on=['ocrx_label'], right_on=['snomed_label'])
# #     import ipdb; ipdb.set_trace()
#     df_merged_with_prefix = add_prefixes(df_merged,trie,trie_ansm)
#     all_stats[filename] = print_stats(df_merged_with_prefix,filename)
#     df_merged_with_prefix.to_csv(fullpath)

In [79]:
!mkdir match-ocrx-full

mkdir: cannot create directory ‘match-ocrx-full’: File exists


In [207]:
substance_match = (ocrx_substance_df,clean_dfs['ALL_IN.json'],'substance')
substance_stats = perform_alignment(*substance_match)

TOTAL MATCH:
                                    ocrx_label  \
103509                       1-ALPHA-VITAMIN D   
113844                              1-PROPANOL   
160593                          2-PHENYLPHENOL   
211334                    4-PHENYLBUTYRIC ACID   
219939  5-CHLORO-2-METHYL-4-ISOTHIAZOLIN-3-ONE   

                                 ocrx_code     rxnorm_code  \
103509  http://www.ocrx.ca/OCRx/3000004084   RxNorm#350465   
113844  http://www.ocrx.ca/OCRx/3000000930  RxNorm#1362872   
160593  http://www.ocrx.ca/OCRx/3000000039   RxNorm#236677   
211334  http://www.ocrx.ca/OCRx/3100005151  RxNorm#1546447   
219939  http://www.ocrx.ca/OCRx/3000004417  RxNorm#1367119   

                                  rxnorm_label ocrx_match_type  \
103509                       1-ALPHA-VITAMIN D           EXACT   
113844                              1-PROPANOL           EXACT   
160593                          2-PHENYLPHENOL           EXACT   
211334                    4-PHENYLBUTYRIC ACID   

In [17]:
ocrx_form_word_set = set(ocrx_form_df['ocrx_label'].values)

In [18]:
ocrx_roa_word_set = set(ocrx_roa_df['ocrx_label'].values)

In [19]:
list(range(5,3,-1))

[5, 4]

In [20]:
dose_forms = clean_dfs['DF.json']
def split_form_roa(text,form_set=ocrx_form_word_set,roa_set=ocrx_roa_word_set):
    words = text.split(" ")
    form, roa = None, None
    word_len = len(words)
    term = None
    found = 0
    for window_size in range(word_len,0,-1):
        for i in range(word_len-window_size+1):
            word = ' '.join(words[i:i+window_size])
            if word in roa_set:
                term = word
                found = 2
                break
            
        if found != 0:
            break
    if found == 0:
        for window_size in range(word_len,0,-1):
            for i in range(word_len-window_size+1):
                word = ' '.join(words[i:i+window_size])
                if word in form_set:
                    term = word
                    found = 1
                    break

            if found != 0:
                break
                
    if found == 0:
        return None, None, None
    other_term = ' '.join([word for word in words if word != term])
    form, roa = (term, other_term) if found == 1 else (other_term, term)
    splitted_form = form.split(' ')
    if splitted_form[-1] in form_set and splitted_form[0] not in form_set and len(splitted_form) > 1:
        rest = ' '.join(splitted_form[:-1])
        form = splitted_form[-1] + ' ' + f'({rest})'
    if form.endswith(" FOR"):
        form = ' '.join(form.split(' ')[:-1])
    if roa.startswith("FOR "):
        roa = ' '.join(roa.split(' ')[1:])
    if form not in form_set and len(form.split(' ')) == 1:
        return None, roa, form
    print(f"Term:{text}\tForm:{form}\tRoa:{roa}")
    return form, roa, None

def create_form_roa(df):
    splitted = [split_form_roa(term) for term in df['rxnorm_label'].values]
    forms = [el[0] for el in splitted]
    roas = [el[1] for el in splitted]
    guessed_forms = [el[2] for el in splitted]
    df['form'] = forms
    df['roa'] = roas
    df['guessed_form'] = guessed_forms
    return df

In [21]:
split_form_roa('ORAL TABLET')

Term:ORAL TABLET	Form:TABLET	Roa:ORAL


('TABLET', 'ORAL', None)

In [22]:
'ORAL TABLET' in ocrx_form_word_set

True

In [23]:
create_form_roa(dose_forms)

Term:BUCCAL TABLET	Form:TABLET	Roa:BUCCAL
Term:CHEWABLE EXTENDED RELEASE ORAL TABLET	Form:TABLET (CHEWABLE EXTENDED RELEASE)	Roa:ORAL
Term:CHEWABLE TABLET	Form:TABLET	Roa:CHEWABLE
Term:CHEWING GUM	Form:GUM	Roa:CHEWING
Term:DELAYED RELEASE ORAL CAPSULE	Form:CAPSULE (DELAYED RELEASE)	Roa:ORAL
Term:DELAYED RELEASE ORAL GRANULES	Form:GRANULES (DELAYED RELEASE)	Roa:ORAL
Term:DELAYED RELEASE ORAL TABLET	Form:TABLET (DELAYED RELEASE)	Roa:ORAL
Term:DISINTEGRATING ORAL TABLET	Form:TABLET (DISINTEGRATING)	Roa:ORAL
Term:DOUCHE	Form:DOUCHE	Roa:
Term:DRY POWDER INHALER	Form:POWDER	Roa:DRY INHALER
Term:EFFERVESCENT ORAL TABLET	Form:TABLET (EFFERVESCENT)	Roa:ORAL
Term:ENEMA	Form:ENEMA	Roa:
Term:EXTENDED RELEASE ORAL CAPSULE	Form:CAPSULE (EXTENDED RELEASE)	Roa:ORAL
Term:EXTENDED RELEASE ORAL TABLET	Form:TABLET (EXTENDED RELEASE)	Roa:ORAL
Term:EXTENDED RELEASE SUSPENSION	Form:SUSPENSION	Roa:EXTENDED RELEASE
Term:GAS FOR INHALATION	Form:GAS	Roa:INHALATION
Term:GRANULES FOR ORAL SOLUTION	Form:GRANULES FO

Unnamed: 0,rxnorm_code,rxnorm_label,form,roa,guessed_form
54,RxNorm#1649570,AUTO-INJECTOR,,,
119,RxNorm#858080,BUCCAL FILM,,BUCCAL,FILM
53,RxNorm#970789,BUCCAL TABLET,TABLET,BUCCAL,
55,RxNorm#1649572,CARTRIDGE,,,
47,RxNorm#2269573,CHEWABLE EXTENDED RELEASE ORAL TABLET,TABLET (CHEWABLE EXTENDED RELEASE),ORAL,
...,...,...,...,...,...
122,RxNorm#11108,VAGINAL GEL,GEL,VAGINAL,
52,RxNorm#2107950,VAGINAL INSERT,INSERT,VAGINAL,
86,RxNorm#317010,VAGINAL OINTMENT,OINTMENT,VAGINAL,
61,RxNorm#2173190,VAGINAL SPONGE,SPONGE,VAGINAL,


In [24]:

pd.set_option('display.max_rows', None)
dose_forms

Unnamed: 0,rxnorm_code,rxnorm_label,form,roa,guessed_form
54,RxNorm#1649570,AUTO-INJECTOR,,,
119,RxNorm#858080,BUCCAL FILM,,BUCCAL,FILM
53,RxNorm#970789,BUCCAL TABLET,TABLET,BUCCAL,
55,RxNorm#1649572,CARTRIDGE,,,
47,RxNorm#2269573,CHEWABLE EXTENDED RELEASE ORAL TABLET,TABLET (CHEWABLE EXTENDED RELEASE),ORAL,
29,RxNorm#91058,CHEWABLE TABLET,TABLET,CHEWABLE,
65,RxNorm#402499,CHEWING GUM,GUM,CHEWING,
13,RxNorm#316995,DELAYED RELEASE ORAL CAPSULE,CAPSULE (DELAYED RELEASE),ORAL,
12,RxNorm#2284290,DELAYED RELEASE ORAL GRANULES,GRANULES (DELAYED RELEASE),ORAL,
83,RxNorm#10312,DELAYED RELEASE ORAL TABLET,TABLET (DELAYED RELEASE),ORAL,


In [25]:
dose_forms.to_csv("dose_forms_parsed.csv") 

In [26]:
rxnorm_form_df = pd.DataFrame([{"rxnorm_id" : i, "rxnorm_label" : term} for i, term in enumerate(set(list(dose_forms['form']) + list(dose_forms['guessed_form'].values))) if term != "" and term is not None])
rxnorm_roa_df = pd.DataFrame([{"rxnorm_id" : i, "rxnorm_label" : term} for i, term in enumerate(set(list(dose_forms['roa']))) if term != "" and term is not None])

In [27]:
rxnorm_roa_df

Unnamed: 0,rxnorm_id,rxnorm_label
0,1,MEDICATED SOAP
1,2,CHEWING
2,3,DRY INHALER
3,4,TRANSDERMAL
4,5,OPHTHALMIC
5,6,NASAL
6,7,VAGINAL
7,8,EXTENDED RELEASE
8,9,BUCCAL
9,10,TOPICAL


In [28]:
rxnorm_form_df.columns = ['rxnorm_code','rxnorm_label']

In [29]:
rxnorm_roa_df.columns = ['rxnorm_code', 'rxnorm_label']

In [32]:
import os
if not os.path.exists("export"):
    os.mkdir("export")
def export_df(df,term):
    new_df = df.copy()
    new_df.columns = ['rxnorm_code','rxnorm_label']
    new_df = new_df.drop_duplicates().dropna().sort_values('rxnorm_label')
    new_df.to_csv(f'export/rxnorm_{term}_en.csv')
dfs['form.json'] = rxnorm_form_df
dfs['roa.json'] = rxnorm_roa_df

filenames = ['form.json','roa.json','SCD.json','SCDC.json','ALL_IN.json']
maps = ['form','roa','drug','component','active_ingredient']
for mapped_name, fn in zip(maps,filenames):
    export_df(dfs[fn],mapped_name)

In [1]:
!nemo .

In [33]:
!nemo .

In [203]:
ocrx_form_df, ocrx_roa_df = [df[['ocrx_code','ocrx_label']] for df in [ocrx_form_df,ocrx_roa_df]]
form_stats = perform_alignment(ocrx_form_df,rxnorm_form_df,'form')
roa_stats = perform_alignment(ocrx_roa_df,rxnorm_roa_df,'roa')

TOTAL MATCH:
                              ocrx_code ocrx_label  rxnorm_code rxnorm_label  \
65   http://www.ocrx.ca/OCRx/4000000120        BAR         30.0          BAR   
66   http://www.ocrx.ca/OCRx/4000000120        BAR         30.0          BAR   
77   http://www.ocrx.ca/OCRx/4000000017    CAPSULE          3.0      CAPSULE   
78   http://www.ocrx.ca/OCRx/4000000017    CAPSULE          3.0      CAPSULE   
120  http://www.ocrx.ca/OCRx/4000000065      CREAM         34.0        CREAM   

    ocrx_match_type  snomed_code snomed_label snomed_match_type  
65               NA         30.0          BAR                NA  
66               NA         30.0          BAR                NA  
77               NA          3.0      CAPSULE                NA  
78               NA          3.0      CAPSULE                NA  
120              NA         34.0        CREAM                NA  
(88, 8)
ONLY OCRX
                            ocrx_code                         ocrx_label  \
0  http://www.oc

In [204]:
form_stats

{'form': {'total': 88, 'ocrx': 1715, 'rxnorm': 5}}

In [205]:
roa_stats

{'roa': {'total': 17, 'ocrx': 81, 'rxnorm': 10}}

In [208]:
all_stats = [substance_stats, form_stats, roa_stats]

In [215]:
all_dict = dict()
for stat in all_stats:
    for key, val in stat.items():
        all_dict[key] = val

In [218]:
pd.DataFrame([{'name' : key, **rest} for key, rest in all_dict.items()])

Unnamed: 0,name,total,ocrx,rxnorm
0,substance,3603,2826036,13715
1,form,88,1715,5
2,roa,17,81,10


In [None]:
!alacritty