In [26]:
import sys
import os
sys.path.insert(0, os.path.abspath('/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages'))
#sys.path


## Match all excat terms
1. unflatten other names of each disease and combine with disease name
2. for each disease, check if any name matches the icd10 name of icd10 mapping
3. keep track of how may codes are matched
4. add a column of phecode in the drug-disease.csv file

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv("drugs.com/drug_disease.csv")
df.columns


Index(['Unnamed: 0', 'disease name', 'other names', 'RX/OTC', 'drug',
       'generic name', 'drug class', 'brand names', 'pregnacy label',
       'CSA label'],
      dtype='object')

In [3]:
import ast
import numpy as np

all_names = []

other_names = df["other names"].apply(lambda x: x.lower()).to_list()

name = df["disease name"].apply(lambda x: x.lower()).to_list()

for i in range((len(name))):
    new_line = []
    new_line.append(name[i])
    other_names_list = ast.literal_eval(other_names[i])
    cleaned_other_names = []
    if other_names_list:
        other = other_names_list[0].split(';')
        for o in other:
            o = o.lstrip().rstrip().replace(",", "")
            cleaned_other_names.append(o)
    new_line += cleaned_other_names
    all_names.append(new_line)
    
all_names = list(np.unique(np.array(all_names)))
print(len(all_names))
all_names[:20]


1508


[['abdominal distension', 'abdominal bloating', 'bloating', 'meteorism'],
 ['abdominal distension prior to abdominal x ray'],
 ['abdominal radiological procedure', 'abdominal radiological procedures'],
 ['abnormal uterine bleeding',
  'bleeding dysfunctional uterine',
  'dub',
  'dysfunctional uterine bleeding'],
 ['abortion', 'complete abortion'],
 ['acetaminophen overdose', 'acetaminophen toxicity'],
 ['acidosis', 'acidosis lactic', 'acidosis respiratory'],
 ['acne',
  'acne vulgaris',
  'blackheads',
  'breakouts',
  'cystic acne',
  'pimples',
  'whiteheads',
  'zits'],
 ['acne rosacea', 'acne rosacea'],
 ['acromegaly'],
 ['actinic keratosis'],
 ['actinomycosis', 'lumpy jaw'],
 ['acute coronary syndrome'],
 ['acute coronary syndrome prophylaxis'],
 ['acute gout'],
 ['acute lymphoblastic leukemia'],
 ['acute lymphocytic leukemia',
  'acute childhood leukemia',
  'all leukemia',
  'cancer acute childhood leukemia',
  'cancer acute lymphocytic leukemia',
  'leukemia acute childhood',


In [13]:
# create a dictionary from icd code
icd_df = pd.read_csv("mapping data/icd_merged_result.csv", encoding = "latin", dtype="str")
icd_df[:10]

Unnamed: 0,icd10cm,icd10cm_str,phecode,phecode_str,version
0,1.0,Cholera,8.0,Intestinal infection,9
1,1.0,Cholera due to Vibrio cholerae,8.0,Intestinal infection,9
2,1.1,Cholera due to Vibrio cholerae el tor,8.0,Intestinal infection,9
3,1.9,Cholera NOS,8.0,Intestinal infection,9
4,2.0,Typhoid and paratyphoid fevers,8.0,Intestinal infection,9
5,2.0,Typhoid fever,8.5,Bacterial enteritis,9
6,2.1,Paratyphoid fever A,8.0,Intestinal infection,9
7,2.2,Paratyphoid fever B,8.0,Intestinal infection,9
8,2.3,Paratyphoid fever C,8.0,Intestinal infection,9
9,2.9,Paratyphoid fever NOS,8.0,Intestinal infection,9


In [14]:
icd10_str = icd_df["icd10cm_str"].apply(lambda x: x.lower()).to_list()
phecode = icd_df["phecode"].to_list()
icd_phecode_dict = dict(zip(icd10_str, phecode)) 

icd_phecode_dict["cholera"]

'008'

In [15]:
def _unique(l):
    return list(np.unique(np.array(l)))

In [16]:
mapped_num = 0
# map our data
phecodes = []
i = 0 
for disease in all_names:
    cur_phecodes = []
    for name in disease:
        if name in icd_phecode_dict:
             cur_phecodes.append(icd_phecode_dict[name])
    cur_phecodes = _unique(cur_phecodes)
    
    if len(cur_phecodes) > 0:
        mapped_num += 1
    if len(cur_phecodes) >1:
        print(i)
        
    phecodes.append(cur_phecodes)
    i += 1


39
50
76
95
117
262
436
494
523
548
574
687
885
1036
1085
1179
1290
1358


In [17]:
all_names[574]

['headache', 'tension headache']

In [18]:
icd_phecode_dict['headache']

'339'

In [19]:
icd_phecode_dict['tension headache']

'306.9'

In [20]:
phecodes[:50]

[[],
 [],
 [],
 [],
 [],
 [],
 ['276.41'],
 ['706.1'],
 [],
 [],
 ['702.1'],
 ['041'],
 [],
 [],
 [],
 [],
 [],
 [],
 ['204.21'],
 [],
 [],
 [],
 ['204.21'],
 [],
 ['255.21'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['300.12'],
 [],
 ['317'],
 ['290.2', '317.1'],
 ['317.11'],
 [],
 ['276.42'],
 [],
 [],
 [],
 [],
 ['476'],
 ['947'],
 []]

In [21]:
mapped_percentage = mapped_num/len(all_names)
mapped_percentage

0.2625994694960212

## Fuzzy Matching
1. Split each string of disease (term) in the icd-10 map and in my csv (to lists)
    Splitting needed because 'type a' should be very different from 'type b'
2. For each string in each list, Stem / lemmatize it
3. Map the csv list with the icd-10 list by individual strings, calculate a score for each mapping string, add them up to be the score of a term
4. set a threshold, and adopt mappings w/ scores above that threshold.
5. quality control: check with excat match to see what percentage is different.

In [22]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer()

In [23]:
all_names_stemmed = []
for disease in all_names:
    disease_split = []
    for names in disease:
        name_list = names.split(" ")

        # stem them
        stemmed_name_list = []
        for name in name_list:
            stemmed_name_list.append(ps.stem(name))
        disease_split.append(" ".join(stemmed_name_list))

    all_names_stemmed.append(disease_split)

all_names_stemmed[:10]

[['abdomin distens', 'abdomin bloat', 'bloat', 'meteor'],
 ['abdomin distens prior to abdomin x ray'],
 ['abdomin radiolog procedur', 'abdomin radiolog procedur'],
 ['abnorm uterin bleed',
  'bleed dysfunct uterin',
  'dub',
  'dysfunct uterin bleed'],
 ['abort', 'complet abort'],
 ['acetaminophen overdos', 'acetaminophen toxic'],
 ['acidosi', 'acidosi lactic', 'acidosi respiratori'],
 ['acn',
  'acn vulgari',
  'blackhead',
  'breakout',
  'cystic acn',
  'pimpl',
  'whitehead',
  'zit'],
 ['acn rosacea', 'acn rosacea'],
 ['acromegali']]

In [24]:
icd10_str_stemmed = []
for disease in icd10_str:
    disease_split = disease.split(" ")
    stemmed_disease_split = []
    for d in disease_split:
        stemmed_disease_split.append(ps.stem(d))
    icd10_str_stemmed.append(" ".join(disease_split))
icd10_str_stemmed[:10]

['cholera',
 'cholera due to vibrio cholerae',
 'cholera due to vibrio cholerae el tor',
 'cholera nos',
 'typhoid and paratyphoid fevers',
 'typhoid fever',
 'paratyphoid fever a',
 'paratyphoid fever b',
 'paratyphoid fever c',
 'paratyphoid fever nos']

### The approach would not work in that it still could not differentiate  'paratyphoid fever a' from 'paratyphoid fever b'.
It might match more terms, but would allow too much potential error.
In general, levenshtein distance calculated fuzzy match would not work.

In [27]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

fuzz.ratio('cholera nos', "cholera")



78

In [28]:
fuzz.ratio('paratyphoid fever a', 'paratyphoid fever b')



95

### Check stemming

In [29]:
icd_phecode_dict_stemmed = dict(zip(icd10_str_stemmed, phecode))


In [30]:
icd_phecode_dict_stemmed["headache"]

'339'

In [31]:
mapped_num_stemmed = 0
# map our data
phecodes_stemmed = []
i = 0
for disease in all_names_stemmed:
    cur_phecodes = []
    for name in disease:
        if name in icd_phecode_dict_stemmed:
             cur_phecodes.append(icd_phecode_dict_stemmed[name])
    cur_phecodes = _unique(cur_phecodes)

    if len(cur_phecodes) > 0:
        mapped_num_stemmed += 1
    if len(cur_phecodes) >1:
        print(i)
        phecodes_stemmed
    phecodes.append(cur_phecodes)
    i += 1


50


In [32]:
mapped_percentage_stemmed = mapped_num/len(all_names_stemmed)
mapped_percentage_stemmed



0.2625994694960212