In [44]:
import pandas as pd
import pickle
from universal_drug_parser import Drug, Component, mega_mega_dict

MEGA_DICT_FN = 'ner-results/all_drugs.pickle'


def load_mega_dict(fn=MEGA_DICT_FN):
    """Unpickle and return the object stored in the file at ``fn``.

    NOTE: ``pickle.load`` can execute arbitrary code; only point this at
    pickles produced by a trusted pipeline.
    """
    with open(fn, mode="rb") as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

mega_dict = load_mega_dict()

In [45]:
mega_dict.keys()

dict_keys(['ocrx_fr', 'ocrx_en', 'snomed_en', 'dailymed_en', 'rxnorm_en', 'ansm_fr'])

In [46]:
from collections import defaultdict
# construct a dict where it is indexed by substance, and all drugs that contain it.
active_ingredient_dicts = dict()

def update_dict(active_ingredient_dict,drug):
    """Append ``drug`` to the bucket keyed by its sorted ingredient names.

    The key is the tuple of the first element of every entry in
    ``drug.components``, ordered by that first element.
    ``active_ingredient_dict`` is indexed directly, so a missing key must
    already resolve to a list (e.g. a ``defaultdict``).
    """
    ordered_components = sorted(drug.components, key=lambda compo: compo[0])
    ingredient_key = tuple(compo[0] for compo in ordered_components)
    active_ingredient_dict[ingredient_key].append(drug)
# For every source in mega_dict, index its drugs by ingredient tuple,
# skipping None entries, and store the result as a plain dict.
for source_key, source_drugs in mega_dict.items():
    ingredient_index = defaultdict(list)
    for candidate in source_drugs:
        if candidate is None:
            continue
        update_dict(ingredient_index, candidate)
    active_ingredient_dicts[source_key] = dict(ingredient_index)

In [47]:
Drug.parse_labels(['ACETAMINOPHEN 650 MG/SACHET | ASCORBIC ACID 250 MG/SACHET | CAFFEINE 30 MG/SACHET | CHLORPHENIRAMINE MALEATE 4 MG/SACHET | DEXTROMETHORPHAN HYDROBROMIDE 20 MG/SACHET AS GRANULES FOR SOLUTION IN ORAL'],'ocrx_en')

[Original: ACETAMINOPHEN 650 MG/SACHET | ASCORBIC ACID 250 MG/SACHET | CAFFEINE 30 MG/SACHET | CHLORPHENIRAMINE MALEATE 4 MG/SACHET | DEXTROMETHORPHAN HYDROBROMIDE 20 MG/SACHET AS GRANULES FOR SOLUTION IN ORAL
 Components: [('ACETAMINOPHEN', '650 MG/SACHET'), ('ASCORBIC ACID', '250 MG/SACHET'), ('CAFFEINE', '30 MG/SACHET'), ('CHLORPHENIRAMINE MALEATE', '4 MG/SACHET'), ('DEXTROMETHORPHAN HYDROBROMIDE', '20 MG/SACHET')]. Form: GRANULES FOR SOLUTION. Route: ORAL.
 ----------------]

In [48]:
[el for el in mega_dict['ocrx_en'] if 'ACETAMINOPHEN' in el.drug and 'ASCORBIC ACID 250 MG/SACHET' in el.drug]

[Original: ACETAMINOPHEN 650 MG/SACHET | ASCORBIC ACID 250 MG/SACHET | CAFFEINE 30 MG/SACHET | CHLORPHENIRAMINE MALEATE 4 MG/SACHET | DEXTROMETHORPHAN HYDROBROMIDE 20 MG/SACHET AS GRANULES FOR SOLUTION IN ORAL
 Components: [('ACETAMINOPHEN', '650 MG/SACHET'), ('ASCORBIC ACID', '250 MG/SACHET'), ('CAFFEINE', '30 MG/SACHET'), ('CHLORPHENIRAMINE MALEATE', '4 MG/SACHET'), ('DEXTROMETHORPHAN HYDROBROMIDE', '20 MG/SACHET')]. Form: GRANULES FOR SOLUTION. Route: ORAL.
 ----------------]

In [49]:
[el for el in active_ingredient_dicts['rxnorm_en'].keys() if 'ACETAMINOPHEN' in el]

[('ACETAMINOPHEN',),
 ('ACETAMINOPHEN', 'CAFFEINE', 'CHLORPHENIRAMINE MALEATE'),
 ('ACETAMINOPHEN',
  'CHLORPHENIRAMINE MALEATE',
  'DEXTROMETHORPHAN HYDROBROMIDE'),
 ('ACETAMINOPHEN', 'DEXTROMETHORPHAN HYDROBROMIDE'),
 ('ACETAMINOPHEN', 'ASPIRIN', 'CAFFEINE', 'SALICYLAMIDE'),
 ('ACETAMINOPHEN', 'SODIUM SALICYLATE'),
 ('ACETAMINOPHEN', 'CAFFEINE'),
 ('ACETAMINOPHEN', 'ASPIRIN', 'CAFFEINE'),
 ('ACETAMINOPHEN',
  'DEXTROMETHORPHAN HYDROBROMIDE',
  'PSEUDOEPHEDRINE HYDROCHLORIDE'),
 ('ACETAMINOPHEN',
  'CHLORPHENIRAMINE MALEATE',
  'DEXTROMETHORPHAN HYDROBROMIDE',
  'PHENYLEPHRINE HYDROCHLORIDE'),
 ('ACETAMINOPHEN',
  'DEXTROMETHORPHAN HYDROBROMIDE',
  'GUAIFENESIN',
  'PHENYLEPHRINE HYDROCHLORIDE'),
 ('ACETAMINOPHEN', 'HYDROCODONE BITARTRATE'),
 ('ACETAMINOPHEN',
  'DEXTROMETHORPHAN HYDROBROMIDE',
  'DIPHENHYDRAMINE',
  'GUAIFENESIN',
  'PHENYLEPHRINE HYDROCHLORIDE'),
 ('ACETAMINOPHEN', 'BUTALBITAL', 'CAFFEINE'),
 ('ACETAMINOPHEN',
  'DEXTROMETHORPHAN HYDROBROMIDE',
  'DOXYLAMINE SUCCINA

In [50]:
active_ingredient_dicts['dailymed_en']

{('.ALPHA.-TOCOPHEROL ACETATE, D-',
  'CALCIUM CARBONATE',
  'CALCIUM FORMATE',
  'CHOLECALCIFEROL',
  'CHOLINE BITARTRATE',
  'CYANOCOBALAMIN',
  'FERROUS BISGLYCINATE',
  'FOLIC ACID',
  'NIACINAMIDE',
  'POTASSIUM IODIDE',
  'PYRIDOXINE HYDROCHLORIDE',
  'RIBOFLAVIN',
  'SODIUM ASCORBATE',
  'THIAMINE MONONITRATE',
  'ZINC OXIDE'): [Original: .ALPHA.-TOCOPHEROL ACETATE, D- 30 [IU] | RIBOFLAVIN 3 MG | FOLIC ACID 1 MG | NIACINAMIDE 20 MG | CYANOCOBALAMIN 10 UG | POTASSIUM IODIDE 100 UG | ZINC OXIDE 10 MG | SODIUM ASCORBATE 120 MG | THIAMINE MONONITRATE 3 MG | CHOLECALCIFEROL 450 [IU] | PYRIDOXINE HYDROCHLORIDE 50 MG | CALCIUM FORMATE 155 MG | CALCIUM CARBONATE 45 MG | FERROUS BISGLYCINATE 32 MG | CHOLINE BITARTRATE 55 MG WITH FORM OF TABLET, FILM COATED WITH ROA OF ORAL
  Components: [('.ALPHA.-TOCOPHEROL ACETATE, D-', '30 [IU]'), ('RIBOFLAVIN', '3 MG'), ('FOLIC ACID', '1 MG'), ('NIACINAMIDE', '20 MG'), ('CYANOCOBALAMIN', '10 UG'), ('POTASSIUM IODIDE', '100 UG'), ('ZINC OXIDE', '10 MG

In [51]:
def make_df_from_dict(some_dict):
    """Turn a ``{tuple_of_names: value}`` mapping into a two-column DataFrame.

    Columns:
        drug_labels -- each key joined with ' | '.
        drug_names  -- the value stored under that key, unchanged.
    """
    labels = []
    payloads = []
    for name_tuple, payload in some_dict.items():
        labels.append(' | '.join(name_tuple))
        payloads.append(payload)

    df = pd.DataFrame()
    df['drug_labels'] = labels
    df['drug_names'] = payloads
    return df

# def run_matcher()
def get_matching_pairs(ocrx_dict,other_dict):
    """Outer-join the two ingredient indexes on their 'drug_labels' column.

    Both mappings are first converted with ``make_df_from_dict``; the OCRx
    frame is the left operand of the merge and the other source the right one.
    """
    return pd.merge(
        make_df_from_dict(ocrx_dict),
        make_df_from_dict(other_dict),
        on='drug_labels',
        how='outer',
    )

# Merge every non-OCRx index with the OCRx index of the same language
# (keys are expected to look like '<source>_<lang>').
merged_dfs = {}
for source_key, source_index in active_ingredient_dicts.items():
    if not source_key.startswith('ocrx'):
        _source_name, lang = source_key.split('_')
        merged_dfs[source_key] = get_matching_pairs(active_ingredient_dicts[f'ocrx_{lang}'], source_index)
    

In [52]:
# Keep only the rows with no missing value after the outer merge and
# report how many remain per source.
no_na_dfs = {}
for source_key, merged_frame in merged_dfs.items():
    print(source_key)
    complete_rows = merged_frame.dropna()
    print(complete_rows.shape[0])
    no_na_dfs[source_key] = complete_rows

snomed_en
1308
dailymed_en
1285
rxnorm_en
1669
ansm_fr
799


In [53]:
import re
# Matches a run of digits, dots and spaces that ends in at least one digit
# (e.g. '20012.5' or '1 000').
# Raw string on purpose: '\.' inside a plain string literal is an invalid
# escape sequence and triggers a SyntaxWarning on recent Python versions.
numb_patt = re.compile(r'[0-9\. ]*[0-9]+')
numb_patt.findall('20012.5 MG / TABLET')

['20012.5']

In [115]:
%pdb on
import re

def kinda_match(word1, word2):
    """Loose comparison of two labels.

    Returns False when either argument is None; otherwise True when the two
    values are equal or when their first space-separated tokens are equal.
    """
    either_missing = word1 is None or word2 is None
    if either_missing:
        return False
    # Short-circuit keeps the split from running when the values are equal.
    return word1 == word2 or word1.split(' ', 1)[0] == word2.split(' ', 1)[0]
def tofloat(el_raw):
    """Convert a numeric string that may contain single spaces to a float.

    After stripping, the text is split on single spaces. If any token after
    the first is shorter than three characters, only the first token is
    converted; otherwise all tokens are concatenated and converted
    (so '1 000' becomes 1000.0).
    """
    tokens = el_raw.strip().split(' ')
    head, tail = tokens[0], tokens[1:]
    for token in tail:
        if len(token) < 3:
            return float(head)
    return float(''.join(tokens))

def strength_match(strength1,strength2):
    """Tell whether two strength strings carry the same leading number.

    Only the first number found by ``numb_patt`` in each string is compared.
    The strings match when those numbers are equal, or when one is 1000 times
    the other. Units are not inspected.

    Returns False when either argument is None or contains no number.

    The comparison uses ``math.isclose`` rather than ``==``: scaling a decimal
    by 1000 in binary floating point is not exact
    (``1.005 * 1000 == 1004.9999999999999``), so strict equality would miss
    genuine equivalences.
    """
    import math

    if strength1 is None or strength2 is None:
        return False
    numbs_1 = [tofloat(el) for el in numb_patt.findall(strength1)]
    numbs_2 = [tofloat(el) for el in numb_patt.findall(strength2)]
    if len(numbs_1) == 0 or len(numbs_2) == 0:
        return False
    numb_1 = numbs_1[0]
    numb_2 = numbs_2[0]

    return (
        math.isclose(numb_1, numb_2)
        or math.isclose(numb_1 * 1000, numb_2)
        or math.isclose(numb_2 * 1000, numb_1)
    )
    

def is_match(drug,ocrx_drug):
    """Return True when the two drugs agree on form and on every strength.

    Forms are compared with ``kinda_match``. Strengths are taken from each
    drug's components ordered by the component's first element, paired
    positionally, and compared with ``strength_match``.
    """
    def _ordered_strengths(components):
        # Second element of each component, ordered by the first element.
        return [compo[1] for compo in sorted(components, key=lambda compo: compo[0])]

    form_matches = kinda_match(ocrx_drug.form, drug.form)
    paired = zip(_ordered_strengths(drug.components), _ordered_strengths(ocrx_drug.components))
    # Every pair is evaluated (list, not generator) before the verdict is combined.
    per_component = [strength_match(left, right) for left, right in paired]
    return form_matches and all(per_component)

def find_ocrx_match(drug,ocrx_drugs):
    """Return, in input order, the entries of ``ocrx_drugs`` for which ``is_match(drug, entry)`` holds."""
    hits = []
    for candidate in ocrx_drugs:
        if is_match(drug, candidate):
            hits.append(candidate)
    return hits
            
# Now filter by basic form and strength as well.
# For each source frame in no_na_dfs, compare every non-OCRx drug with the OCRx drugs
# in the same row and record:
#   mega_table[key]       -> list of (other_drug, [matching OCRx drugs]) for every other_drug,
#                            including those whose match list is empty
#   not_matched_dict[key] -> one (unmatched other drugs, unmatched OCRx drugs) pair per row
mega_table = dict()
not_matched_dict = dict()
for key, df in no_na_dfs.items():
    match_list = list()
    not_match_list = list()
    for _, row in df.iterrows():
        # get_matching_pairs merges the OCRx frame on the left, so the default pandas
        # suffixes make drug_names_x the OCRx side and drug_names_y the other source.
        ocrx = row['drug_names_x']
        other = row['drug_names_y']
        # Codes seen in at least one match within this row, per side.
        matched_ids = set()
        matched_ids_other = set()
        for other_drug in other:
            ocrx_matches = find_ocrx_match(other_drug,ocrx)
            matched_ids.update([el.code for el in ocrx_matches])
            if len(ocrx_matches) > 0:
                matched_ids_other.add(other_drug.code)
            match_list.append((other_drug, ocrx_matches))
        # Whatever was never matched in this row, by code.
        not_matched_ocrx = [el for el in ocrx if el.code not in matched_ids]
        not_matched_other = [el for el in other if el.code not in matched_ids_other]
        not_match_list.append((not_matched_other,not_matched_ocrx))
    not_matched_dict[key] = not_match_list
    mega_table[key] = match_list

Automatic pdb calling has been turned ON


In [116]:
Drug.parse_labels(['6-AMINOCAPROIC ACID 500 MG/TAB AS TABLET IN ORAL'],'ocrx_en')

[Original: 6-AMINOCAPROIC ACID 500 MG/TAB AS TABLET IN ORAL
 Components: [('6-AMINOCAPROIC ACID', '500 MG/TAB')]. Form: TABLET. Route: ORAL.
 ----------------]

In [117]:
Drug.parse_labels(['6-AMINOCAPROIC ACID 500 MG ORAL TABLET'],'rxnorm_en')  

[Original: 6-AMINOCAPROIC ACID 500 MG ORAL TABLET
 Components: [('6-AMINOCAPROIC ACID', '500 MG')]. Form: TABLET. Route: ORAL.
 ----------------]

In [118]:
from pprint import pprint
# Show the original label text of the unmatched drugs for the first five rxnorm rows.
preview = []
for unmatched_other, unmatched_ocrx in not_matched_dict['rxnorm_en'][:5]:
    preview.append({
        'other': [other_el.drug for other_el in unmatched_other],
        'ocrx': [ocrx_el.drug for ocrx_el in unmatched_ocrx],
    })
for el in preview:
    pprint(el)
    print('\n\n------------------\n\n')

{'ocrx': ['6-AMINOCAPROIC ACID 250 MG/ML AS LIQUID IN INTRAVENOUS',
          '6-AMINOCAPROIC ACID 250 MG/ML AS SYRUP IN ORAL'],
 'other': ['20 ML 6-AMINOCAPROIC ACID 250 MG/ML INJECTION',
           '6-AMINOCAPROIC ACID 1000 MG ORAL TABLET',
           '6-AMINOCAPROIC ACID 250 MG/ML INJECTABLE SOLUTION',
           '6-AMINOCAPROIC ACID 250 MG/ML ORAL SOLUTION']}


------------------


{'ocrx': ['ABATACEPT .125 G/ML AS SOLUTION IN SUBCUTANEOUS',
          'ABATACEPT 0.125 G/ML AS SOLUTION IN SUBCUTANEOUS',
          'ABATACEPT 0.25 G/VIAL AS POWDER FOR SOLUTION IN INTRAVENOUS',
          'ABATACEPT 125 MG/ML AS SOLUTION IN SUBCUTANEOUS',
          'ABATACEPT 250 MG/VIAL AS POWDER FOR SOLUTION IN INTRAVENOUS'],
 'other': ['0.4 ML ABATACEPT 125 MG/ML PREFILLED SYRINGE',
           '0.7 ML ABATACEPT 125 MG/ML PREFILLED SYRINGE',
           '1 ML ABATACEPT 125 MG/ML AUTO-INJECTOR',
           '1 ML ABATACEPT 125 MG/ML PREFILLED SYRINGE',
           'ABATACEPT 250 MG INJECTION']}


--------

In [119]:
test = no_na_dfs['rxnorm_en'][no_na_dfs['rxnorm_en']['drug_labels'] == 'ZINC OXIDE']
test['drug_names_x'].values[0][:5]

[Original: ZINC OXIDE 1 DH/TAB AS TABLET IN ORAL
 Components: [('ZINC OXIDE', '1 DH/TAB')]. Form: TABLET. Route: ORAL.
 ----------------,
 Original: ZINC OXIDE 10 % AS CREAM IN TOPICAL
 Components: [('ZINC OXIDE', '10 %')]. Form: CREAM. Route: TOPICAL.
 ----------------,
 Original: ZINC OXIDE 10 % AS POWDER IN TOPICAL
 Components: [('ZINC OXIDE', '10 %')]. Form: POWDER. Route: TOPICAL.
 ----------------,
 Original: ZINC OXIDE 10 %/% AS POWDER IN TOPICAL
 Components: [('ZINC OXIDE', '10 %/%')]. Form: POWDER. Route: TOPICAL.
 ----------------,
 Original: ZINC OXIDE 10.0 % AS CREAM IN TOPICAL
 Components: [('ZINC OXIDE', '10.0 %')]. Form: CREAM. Route: TOPICAL.
 ----------------]

In [120]:
test['drug_names_y'].values[0][:5]

[Original: ZINC OXIDE 0.01 MG/MG TOPICAL OINTMENT
 Components: [('ZINC OXIDE', '0.01 MG/MG')]. Form: OINTMENT. Route: TOPICAL.
 ----------------,
 Original: ZINC OXIDE 0.018 MG/MG TOPICAL OINTMENT
 Components: [('ZINC OXIDE', '0.018 MG/MG')]. Form: OINTMENT. Route: TOPICAL.
 ----------------,
 Original: ZINC OXIDE 0.038 MG/MG TOPICAL OINTMENT
 Components: [('ZINC OXIDE', '0.038 MG/MG')]. Form: OINTMENT. Route: TOPICAL.
 ----------------,
 Original: ZINC OXIDE 0.049 MG/MG TOPICAL OINTMENT
 Components: [('ZINC OXIDE', '0.049 MG/MG')]. Form: OINTMENT. Route: TOPICAL.
 ----------------,
 Original: ZINC OXIDE 0.0938 MG/MG TOPICAL OINTMENT
 Components: [('ZINC OXIDE', '0.0938 MG/MG')]. Form: OINTMENT. Route: TOPICAL.
 ----------------]

In [60]:
# Keep only the (other_drug, ocrx_matches) pairs that have at least one match and
# attach the set of matched OCRx codes to each of them.
clean_mega_table = {}
clean_ids_mega_table = {}
for source_key, pairs in mega_table.items():
    kept = []
    for pair in pairs:
        ocrx_hits = pair[1]
        if len(ocrx_hits) > 0:
            kept.append((pair, {hit.code for hit in ocrx_hits}))
    clean_mega_table[source_key] = kept
    print(source_key)
    print(len(kept))

snomed_en
2266
dailymed_en
13389
rxnorm_en
3016
ansm_fr
3482


In [61]:
mega_dict.keys()

dict_keys(['ocrx_fr', 'ocrx_en', 'snomed_en', 'dailymed_en', 'rxnorm_en', 'ansm_fr'])

In [62]:
from collections import Counter
def filter_dupes(ocrx_drugs):
    """Return the drugs with duplicate ``code`` values removed.

    The first occurrence of every code is kept and the original order is
    preserved.
    """
    first_by_code = {}
    for candidate in ocrx_drugs:
        # setdefault only stores the candidate when its code is new.
        first_by_code.setdefault(candidate.code, candidate)
    return list(first_by_code.values())
# Per-source statistics on the form/strength matching stored in mega_table:
#   '1-n' -> distinct source codes with more than one distinct OCRx match
#   '1-1' -> distinct source codes with exactly one distinct OCRx match
#   '1-0' -> distinct source codes (from mega_dict) left over after removing the two above
preC_mega_stats = dict()

# preC_ids[term]: codes of the source drugs that obtained at least one match.
preC_ids = dict()
for term in mega_table:
    # Drop entries without matches and de-duplicate each match list by OCRx code.
    clean_list = [(el[0],filter_dupes(el[1])) for el in mega_table[term] if len(el[1]) > 0]
    all_ids = {popo[0].code for popo in clean_list}
#     print(all_ids[:20])
    print(term)
    # NOTE(review): all_ids is a set here, so every count reported by Counter is 1 and
    # this print cannot reveal duplicated codes; build a list instead if the goal is
    # the uniqueness check hinted at by the commented-out assert below.
    print(Counter(all_ids).most_common(3))
#     assert len(set(all_ids)) == len(all_ids), 'Drugs from ansm/snomed/rxnorm are not unique ids'
    count_1_n = len({el[0].code for el in clean_list if len(el[1]) > 1 })
    count_1_1 = len({el[0].code for el in clean_list if len(el[1]) == 1})
    preC_ids_el = {el[0].code for el in clean_list if len(el[1]) > 0}
    preC_ids[term] = preC_ids_el
    # all_ids is rebound: from here on it holds every non-None code of the source.
    all_ids = {el.code for el in mega_dict[term] if el is not None}
    count_1_0 = len(all_ids) - count_1_n - count_1_1
    # Holds only if no code is counted in both the 1-n and the 1-1 sets.
    assert len(all_ids) - len(preC_ids_el) == count_1_0
    stat_dict = {'1-n' : count_1_n, '1-1' : count_1_1, '1-0' : count_1_0}
    preC_mega_stats[term] = stat_dict
    

snomed_en
[(425984002, 1), (765952003, 1), (1153450000, 1)]
dailymed_en
[('0904-7136', 1), ('54868-4349', 1), ('70518-2190', 1)]
rxnorm_en
[('RxNorm#283407', 1), ('RxNorm#855296', 1), ('RxNorm#1606308', 1)]
ansm_fr
[(66961409, 1), (66895877, 1), (62414855, 1)]


In [63]:
preC_mega_stats

{'snomed_en': {'1-n': 605, '1-1': 1661, '1-0': 5628},
 'dailymed_en': {'1-n': 5255, '1-1': 8132, '1-0': 32413},
 'rxnorm_en': {'1-n': 839, '1-1': 2177, '1-0': 14258},
 'ansm_fr': {'1-n': 1032, '1-1': 2450, '1-0': 12213}}

In [64]:
# A decimal part made only of zeros: '.0', '.00', ... NOT followed by another digit,
# so significant decimals such as '0.01' or '10.05' are left untouched.
_ZERO_DECIMALS_PATT = re.compile(r'\.0+(?![0-9])')
_DIGIT_PATT = re.compile(r'[0-9]')


def clean_zeros(strength):
    """Normalise decimal notation in a strength string.

    * Tokens (split on single spaces) made only of digits and exactly one
      comma have that comma replaced by a dot ('2,5' -> '2.5').
    * All-zero decimal parts are removed ('10.0 %' -> '10 %').

    Returns None when ``strength`` is None.

    The previous pattern ('\\.0+') removed any '.0…' run even when more digits
    followed, turning '0.01' into '1' and '10.05' into '105'.
    """
    if strength is None:
        return None
    tokens = [
        # Removing the digits leaves exactly ',' only for tokens like '2,5'.
        tok.replace(',', '.') if _DIGIT_PATT.sub('', tok) == ',' else tok
        for tok in strength.split(' ')
    ]
    return _ZERO_DECIMALS_PATT.sub('', ' '.join(tokens))
# Number of distinct ingredient tuples per source.
for source_key, ingredient_index in active_ingredient_dicts.items():
    print(source_key)
    print(len(ingredient_index))


ocrx_fr
28572
ocrx_en
72214
snomed_en
2724
dailymed_en
3044
rxnorm_en
4882
ansm_fr
4203


In [139]:
%pdb on
from universal_drug_parser import drug_is_matched


def valid_drug(el):
    """Tell whether a parsed drug is usable for alignment.

    A drug is valid when it has at least one component, every component has
    exactly two elements, and its label text (``el.drug``) does not contain
    'HOMÉOPATHIQUES'.

    ``None`` is treated as invalid: the other cells of this notebook guard
    their drug lists against None entries, so the filter at the call site
    must not crash on them.
    """
    if el is None:
        return False
    return len(el.components) > 0 and all([len(a) == 2 for a in el.components]) and 'HOMÉOPATHIQUES' not in el.drug
def is_matched(el):
    """Return True when ``el['components']`` is non-empty and no component has None as its first element."""
    components = el['components']
    first_present = [compo[0] is not None for compo in components]
    return all(first_present) and len(components) > 0
# Align every non-OCRx drug that was NOT already matched by the form/strength pass
# (its code is absent from preC_ids) and report how many alignments are accepted
# by drug_is_matched.
# aligned_dict[key] = (list of (alignment, drug), set of accepted codes, set of rejected codes)
aligned_dict = dict()
for key,drug_list in mega_dict.items():
    if key.startswith('ocrx'):
        continue
    lang = key.split('_')[1]
    # NOTE(review): unlike the OCRx cell, there is no `el is not None` guard here;
    # valid_drug(el) reads el.components, so a None entry would raise.
    valid_drug_list = [el for el in drug_list if valid_drug(el) and el.code not in preC_ids[key]]
    aligned = [(el.align_to_database(lang=lang), el) for el in valid_drug_list ]
    aligned_correct = {el[1].code for el in aligned if drug_is_matched(el[0])}
    not_aligned = {el[1].code for el in aligned if not drug_is_matched(el[0])}
    print(key)
    # NOTE(review): the numerator counts distinct codes while the denominator counts
    # list entries, so repeated codes lower the percentage; the division also raises
    # ZeroDivisionError when valid_drug_list is empty.
    pct = 100 * len(aligned_correct) / len(valid_drug_list)
    print(f"{len(aligned_correct)} / {len(valid_drug_list)} total drugs. Percentage covered: {pct:.2f}%")
    print(len(not_aligned))
    aligned_dict[key] = (aligned,aligned_correct, not_aligned)

Automatic pdb calling has been turned ON
snomed_en
4060 / 5628 total drugs. Percentage covered: 72.14%
1568
dailymed_en
9242 / 32471 total drugs. Percentage covered: 28.46%
23176
rxnorm_en
6740 / 14258 total drugs. Percentage covered: 47.27%
7518
ansm_fr
1549 / 10726 total drugs. Percentage covered: 14.44%
9177


In [140]:
# Databases to align the OCRx drugs against, per language.
databases_lang = {
    'en' : ['snomed','rxnorm','dailymed'],
    'fr' : ['ansm']
}
# ocrx_aligned_dict[database_name] = (valid OCRx drugs, their alignments, accepted alignments)
ocrx_aligned_dict = dict()
for key,drug_list in mega_dict.items():
    if not key.startswith('ocrx'):
        continue
    lang = key.split('_')[1]
    # The valid drugs depend only on the OCRx list, not on the target database:
    # filter once per language instead of once per database.
    valid_drug_list = [el for el in drug_list if el is not None and valid_drug(el)]
    for database_name in databases_lang[lang]:
        aligned = [el.align_to_database(database=database_name,lang=lang) for el in valid_drug_list ]
        aligned_correct = [el for el in aligned if is_matched(el)]
        print(database_name)
        # Guard the ratio so an empty drug list reports 0% instead of raising.
        pct = 100 * len(aligned_correct) / len(valid_drug_list) if valid_drug_list else 0.0
        print(f"{len(aligned_correct)} / {len(valid_drug_list)} total drugs. Percentage covered: {pct:.2f}%")
        ocrx_aligned_dict[database_name] = (valid_drug_list, aligned,aligned_correct)

ansm
29539 / 243909 total drugs. Percentage covered: 12.11%
snomed
150518 / 708849 total drugs. Percentage covered: 21.23%
rxnorm
144833 / 708849 total drugs. Percentage covered: 20.43%
dailymed
94294 / 708849 total drugs. Percentage covered: 13.30%


In [141]:
# i = 0
# Write one report per database listing the OCRx drugs whose alignment was rejected,
# and collect the ingredient names of matched / not matched drugs across all databases.
terms = ['snomed','rxnorm','ansm','dailymed']
not_matched_subs = set()
matched_subs = set()
for term in terms:
    # Start from an empty report for every term: the buffer used to be shared across
    # iterations, so each file also contained the records of all previous terms.
    report_lines = []
    valid_drugs, alignments = ocrx_aligned_dict[term][0], ocrx_aligned_dict[term][1]
    for drug, aligned in zip(valid_drugs, alignments):
        if is_matched(aligned):
            matched_subs.update([el[0] for el in drug.components])
            continue
        report_lines.append(str(drug) + '\n')
        report_lines.append(str(aligned) + '\n')
        not_matched_subs.update([el[0] for el in drug.components])
    # Joining once avoids repeatedly copying a growing string.
    content = ''.join(report_lines)
    with open(f'ner-results/not_aligned_{term}.txt',"w") as f:
        f.write(content)
    



In [142]:
# Dump the sorted ingredient names, one per line: first the not-matched set,
# then the matched set.
for out_path, substances in (
    ('ner-results/ocrx-not_matched.txt', not_matched_subs),
    ('ner-results/ocrx_matched.txt', matched_subs),
):
    content = '\n'.join(sorted(substances))
    with open(out_path, "w") as f:
        f.write(content)

In [143]:
from universal_drug_parser import mega_mega_dict

In [144]:
mega_mega_dict['snomed']

{'active_ingredient_en': {'(+)-DELTA-CADINENE SYNTHASE': 130867000,
  '(+)-NEOMENTHOL DEHYDROGENASE': 38132002,
  '(+)-SABINOL DEHYDROGENASE': 129946004,
  '(-)-BORNEOL DEHYDROGENASE': 129945000,
  '(-)-ENDO-FENCHOL SYNTHASE': 130871002,
  '(-)-LIMONENE 3-MONOOXYGENASE': 130170003,
  '(-)-LIMONENE 6-MONOOXYGENASE': 130056000,
  '(-)-LIMONENE 7-MONOOXYGENASE': 130171004,
  '(-)-MENTHOL DEHYDROGENASE': 51303009,
  '(-)-MENTHOL MONOOXYGENASE': 130054002,
  '(1,4)-ALPHA-D-GLUCAN 1-ALPHA-D-GLUCOSYLMUTASE': 130899006,
  '(1-HYDROXYCYCLOHEXAN-1-YL)ACETYL-COA LYASE': 130826005,
  '(2,3-DIHYDROXYBENZOYL)ADENYLATE SYNTHASE': 130453000,
  '(2-AMINOETHYL) PHOSPHONATE-PYRUVATE AMINOTRANSFERASE': 31857001,
  '(3S,4R)-3,4-DIHYDROXYCYCLOHEXA-1,5-DIENE-1,4-DICARBOXYLATE DEHYDROGENASE': 130111005,
  '(N-ACETYLNEURAMINYL)-GALACTOSYLGLUCOSYLCERAMIDE N-ACETYLGALACTOSAMINYLTRANSFERASE': 90019007,
  '(R)-2-HYDROXY-FATTY-ACID DEHYDROGENASE': 16969002,
  '(R)-2-METHYLMALATE DEHYDRATASE': 74678005,
  '(R)-20-HY

aligned_dict

In [145]:
aligned_dict

{'snomed_en': ([({'components': [(('DACTINOMYCIN',
        'DACTINOMYCIN',
        'http://www.ocrx.ca/OCRx/3000000435'),
       ('0.5 MG POWDER', '0.5 MG', 'http://www.ocrx.ca/OCRx/S9220'))],
     'form': ('SOLUTION', 'SOLUTION', 'http://www.ocrx.ca/OCRx/4000000045'),
     'roa': None},
    Original: DACTINOMYCIN 0.5 MG POWDER FOR SOLUTION IN INJECTION VIAL
    Components: [('DACTINOMYCIN', '0.5 MG POWDER')]. Form: SOLUTION. Route: INJECTION VIAL.
    ----------------),
   ({'components': [(('CLOTRIMAZOLE',
        'CLOTRIMAZOLE',
        'http://www.ocrx.ca/OCRx/3000002149'),
       ('100 MG/G', '100 MG/G', 'http://www.ocrx.ca/OCRx/S10580'))],
     'form': ('CREAM', 'CREAM', 'http://www.ocrx.ca/OCRx/4000000065'),
     'roa': ('VAGINAL', 'VAGINAL', 'http://www.ocrx.ca/OCRx/4100000034')},
    Original: CLOTRIMAZOLE 100 MG/G VAGINAL CREAM
    Components: [('CLOTRIMAZOLE', '100 MG/G')]. Form: CREAM. Route: VAGINAL.
    ----------------),
   ({'components': [(('PHENOL',
        'PHENOL',


In [146]:
# Part 2: calculating components

In [147]:
import pickle
from universal_drug_parser import Drug, Component
with open("ner-results/all_compos.pickle","rb") as f:
    mega_dict_compo = pickle.load(f)

In [148]:
from collections import defaultdict
# For every source, group its components by their ``name`` attribute.
compo_activ_dict = {}
for source_key, components in mega_dict_compo.items():
    by_name = {}
    for component in components:
        by_name.setdefault(component.name, []).append(component)
    compo_activ_dict[source_key] = by_name

In [149]:
 compo_activ_dict['rxnorm_en']

{'': [Strength: None. Active ingredient: ],
 '0.25 MG,': [Strength: 0.5 MG DOSE 1.5 ML SEMAGLUTIDE 1.34 MG/ML. Active ingredient: 0.25 MG,,
  Strength: 0.5 MG DOSE 3 ML SEMAGLUTIDE 0.68 MG/ML. Active ingredient: 0.25 MG,],
 '0.5 UNT DOSES': [Strength: 3 ML INSULIN LISPRO 100 UNT/ML. Active ingredient: 0.5 UNT DOSES,
  Strength: 3 ML INSULIN LISPRO-AABC 100 UNT/ML. Active ingredient: 0.5 UNT DOSES],
 '1 ACTUAT LOXAPINE': [Strength: 10 MG/ACTUAT DRY. Active ingredient: 1 ACTUAT LOXAPINE],
 '1 MG DOSE': [Strength: 1.5 ML SEMAGLUTIDE 1.34 MG/ML. Active ingredient: 1 MG DOSE],
 '10 ACTUAT OLODATEROL': [Strength: 025 MG/ACTUAT. Active ingredient: 10 ACTUAT OLODATEROL],
 '10 ACTUAT TIOTROPIUM': [Strength: 025 MG/ACTUAT. Active ingredient: 10 ACTUAT TIOTROPIUM],
 '100 ACTUAT BECLOMETHASONE DIPROPIONATE': [Strength: 08 MG/ACTUAT. Active ingredient: 100 ACTUAT BECLOMETHASONE DIPROPIONATE,
  Strength: 04 MG/ACTUAT. Active ingredient: 100 ACTUAT BECLOMETHASONE DIPROPIONATE],
 '1000 MG ESTRADIOL': 

In [150]:
# Pair, for every non-OCRx source and every ingredient name, the source's components
# with the OCRx components of the same language filed under the same name
# (an empty list when OCRx has none).
compo_match_dict = {}

for source_key, by_ingredient in compo_activ_dict.items():
    if source_key.startswith('ocrx'):
        continue
    lang = source_key.split('_')[1]
    compo_match_dict[source_key] = [
        (source_compos, compo_activ_dict[f'ocrx_{lang}'].get(ingredient, []))
        for ingredient, source_compos in by_ingredient.items()
    ]

In [127]:
def other_strength_match(r_strength1,r_strength2):
    """Compare the leading tokens of two strength strings.

    Every '/' is padded with spaces, each string is split on single spaces,
    and the first two tokens are compared pairwise. When one side has fewer
    tokens, only the tokens present on both sides are compared.
    Returns False when either argument is None.
    """
    if r_strength1 is None or r_strength2 is None:
        return False

    limit = 2
    head1 = r_strength1.replace('/', ' / ').split(' ')[:limit]
    head2 = r_strength2.replace('/', ' / ').split(' ')[:limit]
    for left, right in zip(head1, head2):
        if left != right:
            return False
    return True

In [128]:
strength_match

<function __main__.strength_match(strength1, strength2)>

In [132]:
# For every source component, keep the same-ingredient OCRx components whose strength
# passes other_strength_match; components without any match keep an empty list.
indiv_compo_match_dict = {}
for source_key, pairs in compo_match_dict.items():
    per_component = []
    for other_compos, ocrx_compos in pairs:
        for other_compo in other_compos:
            ocrx_matches = []
            for candidate in ocrx_compos:
                if other_strength_match(candidate.strength, other_compo.strength):
                    ocrx_matches.append(candidate)
            per_component.append((other_compo, ocrx_matches))
    indiv_compo_match_dict[source_key] = per_component

In [133]:
# [el for el in indiv_compo_match_dict['rxnorm_en'] if len(el[1]) > 0]

In [134]:
# Bare lookup kept from the original cell; its value is not displayed here.
indiv_compo_match_dict['snomed_en'][0]
# compo_preC_codes[key]: codes of the source components with at least one OCRx match.
compo_preC_codes = {}
for key, indiv_match_list in indiv_compo_match_dict.items():
    match_counts = [len(el[1]) for el in indiv_match_list]
    count_1_n = sum(1 for n_matches in match_counts if n_matches > 1)
    count_1_1 = sum(1 for n_matches in match_counts if n_matches == 1)
    count_1_0 = sum(1 for n_matches in match_counts if n_matches == 0)
    compo_preC_codes[key] = {el[0].code for el in indiv_match_list if len(el[1]) >= 1}
    print(f"{key}:1-n: {count_1_n} 1-1: {count_1_1}. 1-0: {count_1_0}")

snomed_en:1-n: 920 1-1: 1561. 1-0: 4167
dailymed_en:1-n: 964 1-1: 1475. 1-0: 5453
rxnorm_en:1-n: 1487 1-1: 2118. 1-0: 11851
ansm_fr:1-n: 528 1-1: 1079. 1-0: 10770


In [135]:
print('\n'.join(sorted(compo_activ_dict['ocrx_fr'].keys())))


(CHLORHYDRATE DE METFORMINE)
(CHLORHYDRATE DE MÉTOCLOPRAMIDE MONOHYDRATÉ)
(MALÉATE D'ACÉPROMAZINE
(MALÉATE D'ACÉPROMAZINE ((MALÉATE D'ACÉPROMAZINE)
(MORPHOLINOTHIO) BENZOTHIAZOLE
(S)-1-(3-MERCAPTO-2-MÉTHYL-1-OXOPROPYL)-L-PROLINE
(S)-1-(3-MERCAPTO-2-MÉTHYL-1-OXOPROPYL)-L-PROLINE ((S)-1-(3-MERCAPTO-2-MÉTHYL-1-OXOPROPYL)-L-PROLINE)
(SULFITE ACIDE DE MÉNADIONE SODIQUE)
1,2,3-PROPANETRIOL
1,2,3-PROPANETRIOL (1,2,3-PROPANETRIOL)
1,2-DIHYDROXYPROPANE
1,2-DIHYDROXYPROPANE (1,2-DIHYDROXYPROPANE)
1,3,7-TRIMÉTHYL-XANTHINE
1,3,7-TRIMÉTHYL-XANTHINE (1,3,7-TRIMÉTHYL-XANTHINE)
1-24-ACTH
1-24-ACTH (1-24-ACTH)
1-24-ACTH (HYDROXYDE DE ZINC TÉTRACOSACTIDE)
1-24-CORTICOTROPHINE
1-24-CORTICOTROPHINE (1-24-CORTICOTROPHINE)
1-24-CORTICOTROPHINE (HYDROXYDE DE ZINC TÉTRACOSACTIDE)
1-DÉAMINO-8-D-ARGININE VASOPRESSINE (ACÉTATE DE DESMOPRESSINE TRIHYDRATÉ)
1-DÉAMINO-8-D-ARGININE VASOPRESSINE (ACÉTATE DE DESMOPRESSINE)
1-DÉAMINO-8-D-ARGININE VASOPRESSINE (DESMOPRESSIN ACETATE)
1-DÉSAMINO-8-D-ARGININE VASOPRESSINE

In [136]:
print('\n'.join(sorted(compo_activ_dict['ansm_fr'].keys())))


(S)-LACTATE DE SODIUM
(S)-MALATE DE CABOZANTINIB
6-FLUORO-[18F]-L-DIHYDROXYPHÉNYLALANINE
6-IODOMÉTHYLNORCHOLESTÉROL [131 I]
ABACAVIR
ABACAVIR (SULFATE D')
ABATACEPT 
ABCIXIMAB (C7E3B FAB)
ABEILLE (VENIN D')
ABELMOSCHUS POUR PRÉPARATIONS HOMÉOPATHIQUES
ABIES CANADENSIS POUR PRÉPARATIONS HOMÉOPATHIQUES
ABIES NIGRA POUR PRÉPARATIONS HOMÉOPATHIQUES
ABIES PECTINATA POUR PRÉPARATIONS HOMÉOPATHIQUES
ABROTANUM POUR PRÉPARATIONS HOMÉOPATHIQUES
ABSINTHIUM POUR PRÉPARATIONS HOMÉOPATHIQUES
ABÉMACICLIB
ACALABRUTINIB
ACALYPHA INDICA POUR PRÉPARATIONS HOMÉOPATHIQUES
ACAMPROSATE CALCIQUE
ACARBOSE
ACETANILIDUM POUR PRÉPARATIONS HOMÉOPATHIQUES
ACETICUM ACIDUM POUR PRÉPARATIONS HOMÉOPATHIQUES
ACETONUM POUR PRÉPARATIONS HOMÉOPATHIQUES
ACICLOVIR
ACIDE 3,3-DIPHOSPHONO-1,2-PROPANEDICARBOXYLIQUE MONOHYDRATÉ (SEL TÉTRASODIQUE DE L')
ACIDE ACÉTIQUE GLACIAL
ACIDE ACÉTYLSALICYLIQUE
ACIDE ALPHA-CÉTOGLUTARIQUE
ACIDE ASCORBIQUE
ACIDE ASPARTIQUE
ACIDE AZÉLAÏQUE
ACIDE BORIQUE
ACIDE CARGLUMIQUE
ACIDE CHLORHYDRIQUE
ACI

In [137]:
(set(compo_activ_dict['ansm_fr'].keys()) - set(compo_activ_dict['ocrx_fr'].keys()))

{'ACÉTATE DE LEUPRORÉLINE',
 'DISPERSE BLUE 106',
 'ANATOXINE DIPHTÉRIQUE ',
 'EUCALYPTUS GLOBULUS POUR PRÉPARATIONS HOMÉOPATHIQUES',
 'OXALATE DE NALOXÉGOL',
 'MILLEPERTUIS (EXTRAIT SEC DE)',
 "OXALATE D'IVABRADINE",
 'TESTOSTÉRONE (UNDÉCANOATE DE)',
 'NORÉTHANDROLONE',
 'ISOPROPYLPHÉNYLPARAPHÉNYLÈNEDIAMINE',
 'ADONIS VERNALIS POUR PRÉPARATIONS HOMÉOPATHIQUES',
 'HORMONE FOLLICULO STIMULANTE ',
 'CHINA REGIA POUR PRÉPARATIONS HOMÉOPATHIQUES',
 'SULFUR POUR PRÉPARATIONS HOMÉOPATHIQUES',
 'CHLORURE DE MAGNÉSIUM HEXAHYDRATÉ',
 'DIHYDROGENOPHOSPHATE DE SODIUM DODECAHYDRATE',
 'MÉTHYLDOPA SESQUIHYDRATÉ',
 'OXYHYDROXYDE SUCROFERRIQUE',
 'MÉSILATE DE LOMITAPIDE',
 'TRASTUZUMAB EMTANSINE ',
 'DISOPYRAMIDE (PHOSPHATE DE)',
 "NITRATE D'OMOCONAZOLE",
 'CHLORMADINONE (ACÉTATE DE)',
 'BISMUTH (SOUS-NITRATE DE) LOURD POUR PRÉPARATIONS HOMÉOPATHIQUES',
 'PETROLEUM POUR PRÉPARATIONS HOMÉOPATHIQUES',
 'SEDUM ACRE POUR PRÉPARATIONS HOMÉOPATHIQUES',
 "AUBÉPINE (EXTRAIT D') (FLUIDE)",
 'FRAGARIA VESCA PO

In [138]:
len(compo_activ_dict['ansm_fr'])

3917

In [95]:
def is_valid_compo(el):
    """Return True when neither ``el['active_ingredient']`` nor ``el['strength']`` is None."""
    for field in ('active_ingredient', 'strength'):
        if el[field] is None:
            return False
    return True
# Align the components that were not already matched by strength (code absent from
# compo_preC_codes) and split their codes into accepted / rejected alignments.
mega_align_compo_stats_dict = {}

for key, raw_compo_list in mega_dict_compo.items():
    if key.startswith('ocrx'):
        continue
    lang = key.split('_')[1]
    already_matched = compo_preC_codes[key]
    compo_list = [compo for compo in raw_compo_list if compo.code not in already_matched]
    all_aligned = [(compo.align_to_database(lang=lang), compo) for compo in compo_list]

    correct_aligned = set()
    not_correct_aligned = set()
    for alignment, compo in all_aligned:
        target = correct_aligned if is_valid_compo(alignment) else not_correct_aligned
        target.add(compo.code)

    print(f"{key}: {len(correct_aligned)} vs {len(not_correct_aligned)}")
    mega_align_compo_stats_dict[key] = (correct_aligned, not_correct_aligned)

snomed_en: 2544 vs 1413
dailymed_en: 2469 vs 2238
rxnorm_en: 4561 vs 6297
ansm_fr: 1807 vs 8435


In [66]:
mega_align_compo_stats_dict['ansm_fr']

([({'active_ingredient': ('CHLORHYDRATE DE TRAMADOL',
     'CHLORHYDRATE DE TRAMADOL',
     'http://www.ocrx.ca/OCRx/3000001260'),
    'strength': ('200 MG / UNE GÉLULE',
     '200 MG',
     'http://www.ocrx.ca/OCRx/S1663')},
   Strength: 200 MG / UNE GÉLULE. Active ingredient: CHLORHYDRATE DE TRAMADOL),
  ({'active_ingredient': ('FUMARATE DE QUÉTIAPINE',
     'FUMARATE DE QUÉTIAPINE',
     'http://www.ocrx.ca/OCRx/3000003807'),
    'strength': ('50 MG / UN COMPRIMÉ',
     '50 MG',
     'http://www.ocrx.ca/OCRx/S8205')},
   Strength: 50 MG / UN COMPRIMÉ. Active ingredient: FUMARATE DE QUÉTIAPINE),
  ({'active_ingredient': ('PARACÉTAMOL',
     'PARACÉTAMOL',
     'http://www.ocrx.ca/OCRx/3000004395'),
    'strength': ('10 MG / 1 ML', '10 MG', 'http://www.ocrx.ca/OCRx/S10584')},
   Strength: 10 MG / 1 ML. Active ingredient: PARACÉTAMOL),
  ({'active_ingredient': ('CHLORHYDRATE DE TÉTRACYCLINE',
     'CHLORHYDRATE DE TÉTRACYCLINE',
     'http://www.ocrx.ca/OCRx/3000001748'),
    'strength