In [8]:
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import sys
sys.path.append('../')
from utils import *
from Levenshtein import ratio, distance #https://rawgit.com/ztane/python-Levenshtein/master/docs/Levenshtein.html
from sklearn.metrics.pairwise import cosine_similarity

from IPython.display import display
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

def set_style():
    """Apply the notebook's plotting style: seaborn white + paper sheets, Arial font.

    Matplotlib 3.6 renamed its bundled seaborn style sheets to
    ``seaborn-v0_8-<name>`` and later releases dropped the old aliases, so each
    requested sheet is resolved against ``plt.style.available`` before use.
    """
    available = set(plt.style.available)
    resolved = []
    for style in ['seaborn-white', 'seaborn-paper']:
        renamed = style.replace('seaborn-', 'seaborn-v0_8-', 1)
        if style not in available and renamed in available:
            resolved.append(renamed)
        else:
            # Keep the original name; if it is truly unknown, plt.style.use
            # raises exactly as it did before.
            resolved.append(style)
    plt.style.use(resolved)
    matplotlib.rc("font", family="Arial")
set_style()

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Load the raw knowledge-graph CSV and derive its node table.
# NOTE(review): read_kg / get_nodes are not defined in this notebook
# (presumably they come from `from utils import *`) — confirm their return schema there.
kg = read_kg('../knowledge_graph/kg_v6_raw.csv')
nodes = get_nodes(kg)

In [10]:
# Disease nodes only, de-duplicated, with the row index renumbered from zero.
dis = (
    nodes.loc[nodes['node_type'] == "disease"]
    .drop_duplicates()
    .reset_index()
    .drop(columns='index')
)

### UMLS 

In [75]:
# Attach MONDO ids to the UMLS disease and disorder definitions by joining the
# MONDO cross-reference table (ontology_id) to the UMLS tables (CUI).
mondo_xref = read_csv('../knowledge_graph/databases/mondo/mondo_references')

umls_disease = mondo_xref.merge(
    read_csv('../data/umls_def_disease'),
    how='inner', left_on='ontology_id', right_on='CUI',
)[['mondo_id', 'description']]

umls_disorder = mondo_xref.merge(
    read_csv('../data/umls_def_disorder'),
    how='inner', left_on='ontology_id', right_on='CUI',
)[['mondo_id', 'description']]

# Stack both sources, drop repeated (mondo_id, description) rows, store as strings.
umls_features = pd.concat([umls_disease, umls_disorder])
umls_features = umls_features.drop_duplicates().astype('str')
display(umls_features.head())

Unnamed: 0,mondo_id,description
0,1,A definite pathologic process with a character...
1,1,"In medicine, a health problem with certain cha..."
2,1,Any abnormal condition of the body or mind tha...
3,1,Any deviation from or interruption of the norm...
4,1,top term heading for all specific disorders an...


### Mayo

In [123]:
# Load the Mayo table and add a lower-cased copy of each name for string matching.
mayo_data = read_csv('../data/mayo')
mayo_data['mayo_name_lower'] = [name.lower() for name in mayo_data['name'].values]
mayo_data.head(1)

Unnamed: 0,name,link,Symptoms,Causes,Risk factors,Complications,Prevention,mayo_name_lower
0,Dandruff,https://www.mayoclinic.org/diseases-conditions/dandruff/symptoms-causes/syc-20353850,"Dandruff signs and symptoms may include:\nSkin flakes on your scalp, hair, eyebrows, beard or mustache, and shoulders\nItchy scalp\nScaly, crusty scalp in infants with cradle cap\nThe signs and symptoms may be more severe if you're stressed, and they tend to flare in cold, dry seasons.\nWhen to see a doctor\nMost cases of dandruff don't require a doctor's care. See your primary care doctor or a doctor who specializes in skin conditions (dermatologist) if your condition doesn’t improve with regular use of over-the-counter dandruff shampoo.","Dandruff may have several causes, including:\nIrritated, oily skin\nNot shampooing enough\nA yeastlike fungus (malassezia) that feeds on oils on the scalps of most adults\nDry skin\nSensitivity to hair care products (contact dermatitis)\nOther skin conditions, such as psoriasis and eczema","Almost anyone can have dandruff, but certain factors can make you more susceptible:\nAge. Dandruff usually begins in young adulthood and continues through middle age. That doesn't mean older adults don't get dandruff. For some people, the problem can be lifelong.\nBeing male. Because more men have dandruff, some researchers think male hormones may play a role.\nCertain illnesses. Parkinson's disease and other diseases that affect the nervous system also seem to increase risk of dandruff. So does having HIV or a weakened immune system.",,,dandruff


In [124]:
# KG disease ids and names, plus a lower-cased name column to match against Mayo names.
mayo_map = dis[['node_id', 'node_name']].copy()
mayo_map['node_name_lower'] = [name.lower() for name in mayo_map['node_name'].values]
mayo_map.head(1)

Unnamed: 0,node_id,node_name,node_name_lower
0,5345,hypospadias (disease),hypospadias (disease)


In [13]:
# Bucket every (Mayo name, KG disease name) pair by how the two lower-cased
# strings relate: identical, Mayo inside KG, or KG inside Mayo.
# Append order matters: later cells pick rows of these lists by position.
mayo_names = mayo_data['mayo_name_lower'].values
node_names = mayo_map['node_name_lower'].values

exact, m_in_kg, kg_in_m = [], [], []

for mn in mayo_names:
    for nn in node_names:
        if mn == nn:
            exact.append((mn, nn))
            continue
        if mn in nn:  # mayo fully in kg
            m_in_kg.append((mn, nn))
            continue
        if nn not in mn:
            continue

        # From here on the KG name is fully contained in the Mayo name.
        if ', also known as ' not in mn:
            kg_in_m.append((mn, nn))
            continue

        # "<name>, also known as <alias>": compare the KG name to each half.
        mn1, mn2 = mn.split(', also known as ')
        if nn in (mn1, mn2):  # exact match on either half
            exact.append((mn, nn))
        elif mn1 in nn or mn2 in nn:  # a half is fully in kg
            m_in_kg.append((mn, nn))
        else:  # kg fully in mayo
            kg_in_m.append((mn, nn))

In [14]:
# Tabulate the exact pairs and the "Mayo name inside KG name" pairs,
# keeping one row per distinct pair.
exact, encapsulated = (
    pd.DataFrame(pairs, columns=['mayo_name_lower', 'node_name_lower']).drop_duplicates()
    for pairs in (exact, m_in_kg)
)

In [15]:
# Pairs accepted so far, and the names on each side that still have no partner.
found = pd.concat([exact, encapsulated], ignore_index=True).drop_duplicates()
remaining = set(mayo_names) - set(found['mayo_name_lower'])
kg_unmatched = set(node_names) - set(found['node_name_lower'])
len(remaining)

1208

In [16]:
# Candidate pairs where the KG name is contained in the Mayo name. These are
# not accepted automatically; rows are hand-picked below by index label.
# NOTE(review): drop_duplicates keeps the original row labels, so the labels in
# ok_idx / also_ok are positions in the kg_in_m list and are only valid for the
# exact input data and loop order that produced it — confirm before re-running on new data.
review = pd.DataFrame(kg_in_m, columns=['mayo_name_lower','node_name_lower']).drop_duplicates()

# Disabled inspection code used to browse the candidates when choosing ok_idx:
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_colwidth', None) 
#ignore = set(['lap','injury','noma','congenital','n syndrome', 'inherited', 'c syndrome', 'h syndrome'])
#for _, x in review.query('node_name_lower not in @ignore').query('mayo_name_lower in @remaining and node_name_lower in @kg_unmatched').groupby('mayo_name_lower'): 
#    display(x)

# First manual pass: row labels of `review` accepted as correct matches.
ok_idx = [58, 63, 146, 145, 152, 310, 447, 451, 454, 705, 707, 708, 168, 345, 355, 509, 513, 215, 
          224, 381, 526, 536, 535, 538, 541, 670, 671, 679, 790, 791, 793, 795, 873, 909, 949, 5, 14, 15, 20, 
          22, 27, 33, 35, 36, 40, 515, 516, 517, 519, 658, 659, 
          660, 662, 664, 775, 776, 765, 858, 621, 624, 628, 480, 752, 757, 907, 937, 
          943, 471, 475, 479, 597, 599, 601, 739, 842, 241, 243, 383, 335, 493, 496, 631,
          635, 636, 638, 640, 397, 177, 180, 522, 321, 322, 464, 468, 716, 721, 722, 724, 
          727, 729, 827, 830, 834, 895, 959, 308, 438, 439, 441, 129, 133, 286, 424, 426, 
          427, 562, 564, 568, 690, 698, 700, 804, 884, 122, 283, 284, 557, 66, 107, 52]

# Add the accepted rows and recompute which Mayo names are still unmatched.
found = pd.concat([found, review.loc[ok_idx, :]], ignore_index=True)
remaining = set(list(mayo_names)) - set(list(found.get(['mayo_name_lower']).drop_duplicates().values.reshape(-1)))

# Disabled inspection code for the second pass (no longer restricted to unmatched KG names):
#ignore = set(['lap','injury','noma','congenital','n syndrome','arteritis', 'arthritis',
#              'inherited', 'c syndrome', 'h syndrome','cancer'])
#for _, x in review.query('node_name_lower not in @ignore').query('mayo_name_lower in @remaining').groupby('mayo_name_lower'): 
#    display(x)

# Second manual pass: further row labels of `review` accepted as matches.
also_ok = [55, 150, 580, 711, 713, 820, 821, 822, 170, 171, 351, 507, 874, 913,
          926, 945, 946, 963, 666, 665, 778, 864, 645, 649, 650, 654, 656, 766, 
          768, 859, 860, 862, 620, 612, 614, 616, 617, 745, 851, 856, 899, 902, 
          942, 472, 605, 607, 740, 458, 549, 336, 340, 342, 254, 394, 368, 372, 
          373, 330, 327, 463, 465, 718, 730, 835, 893, 930, 302, 304, 139, 421, 
          428, 432, 430, 434, 563, 571, 693, 701, 801, 876, 808, 880, 881, 127, 
          277, 278, 409, 410, 420, 116, 29, 30, 31, 32]
found = pd.concat([found, review.loc[also_ok, :]], ignore_index=True).drop_duplicates()
# Refresh both "still unmatched" sets; the cell displays the number of unmatched Mayo names.
remaining = set(list(mayo_names)) - set(list(found.get(['mayo_name_lower']).drop_duplicates().values.reshape(-1)))
kg_unmatched = set(list(node_names)) - set(list(found.get(['node_name_lower']).drop_duplicates().values.reshape(-1)))
len(remaining)

989

In [52]:
# The triple-quoted block below is disabled code kept for provenance; it is a
# bare string expression and does not execute. As written, it saves BERT
# embeddings of the Mayo and KG names, scores each then-unmatched Mayo name
# against every KG name (edit ratio, Levenshtein distance, embedding cosine),
# keeps the top K per metric, and writes the result to matches.csv.
'''
# get BERT embeddings

from transformers import AutoTokenizer, AutoModel, pipeline
from bert.biobert import BioBERT

bert = BioBERT()

def save_embeds(inputs, file): 
    embeds = []
    for i in tqdm(range(len(inputs))):   
        e = bert.get_features(inputs[i])
        embeds.append(e)
    np.save(file, np.concatenate(embeds))

save_embeds(mayo_data.get('name').values, 'mayo_bert_embeds.npy')
save_embeds(mayo_map.get('node_name').values, 'kg_bert_embeds.npy')


# get cosine similarity between bert embeddings 
mayo_bert = np.load('mayo_bert_embeds.npy')
kg_bert = np.load('kg_bert_embeds.npy')
cos_sim = cosine_similarity(mayo_bert, kg_bert)

K = 10
matches = []
for i, mn in enumerate(mayo_names): 
    if mn in remaining:
        s = []
        for j, nn in enumerate(node_names): 
            s.append((mn, nn, round(ratio(mn,nn), 2), distance(mn,nn), round(cos_sim[i][j],2)))
        best = []
        best.extend(sorted(s, key = lambda x: x[2], reverse=True)[:K])
        best.extend(sorted(s, key = lambda x: x[3], reverse=False)[:K])
        best.extend(sorted(s, key = lambda x: x[4], reverse=True)[:K])
        best = list(set(best))
        matches.extend(best)
matches = pd.DataFrame(matches, columns=['mayo_name_lower','node_name_lower','edit_similarity_ratio', 'levenshtein_distance', 'bert_cosine_similarity'])
matches.to_csv('matches.csv')
'''
# Load the candidate table instead of recomputing it.
# NOTE(review): presumably read_csv('matches') reads the matches.csv written by
# the disabled code above — confirm against the read_csv helper.
matches = read_csv('matches')

In [108]:
# Pass 1: accept every candidate whose BERT cosine similarity exceeds 0.98.
ok = list(matches.loc[matches['bert_cosine_similarity'] > 0.98].index.values)
found = pd.concat(
    [found, matches.loc[ok, ['mayo_name_lower', 'node_name_lower']]],
    ignore_index=True,
).drop_duplicates()
remaining = set(mayo_names) - set(found['mayo_name_lower'])

# Pass 2: for Mayo names still unmatched, accept candidates with cosine
# similarity in (0.95, 0.98] that also have an edit-similarity ratio above 0.88.
ok = list(
    matches.loc[
        matches['mayo_name_lower'].isin(remaining)
        & (matches['bert_cosine_similarity'] <= 0.98)
        & (matches['bert_cosine_similarity'] > 0.95)
        & (matches['edit_similarity_ratio'] > 0.88)
    ].index.values
)
found = pd.concat(
    [found, matches.loc[ok, ['mayo_name_lower', 'node_name_lower']]],
    ignore_index=True,
).drop_duplicates()
remaining = set(mayo_names) - set(found['mayo_name_lower'])

# Pass 3: hand-picked row labels of `matches` accepted after manual review.
ok = [
    3566, 3911, 4976, 5195, 8612, 9059, 11226, 16646, 17790, 21330, 21338, 22313,
    277, 1356, 1438, 7811, 7939, 9943, 11487, 11826, 12734, 21471, 4978, 7203, 9160,
    9919, 10562, 13868, 14709, 16408, 22253, 576, 2419, 3700, 4539, 9104, 11218, 18025,
    20407, 22398, 22655, 8041, 9782, 13257, 16653, 16655, 16656, 16658, 16659, 16665,
    16670, 19757, 20887, 23195, 11711, 16335, 17532, 1190, 1301, 2936, 3130, 4647, 4839,
    8314, 10100, 10382, 18228, 19794, 21189, 21734, 21937, 15024, 17689, 18106, 17040,
    18827, 14786, 14362, 14208, 5032, 5024,
]
found = pd.concat(
    [found, matches.loc[ok, ['mayo_name_lower', 'node_name_lower']]],
    ignore_index=True,
).drop_duplicates()
remaining = set(mayo_names) - set(found['mayo_name_lower'])

ok = []

len(remaining)

885

In [163]:
# Propagate matches between Mayo names: a still-unmatched Mayo name that is
# very similar (by embedding cosine) to an already-matched Mayo name inherits
# that name's KG matches.
mayo_bert = np.load('mayo_bert_embeds.npy')
# Pairwise cosine similarity of the Mayo embeddings with themselves.
x = cosine_similarity(mayo_bert)
# Long-format table with one row per ordered pair of Mayo names.
mayo_sim = []
for i, n1 in enumerate(mayo_names): 
    for j, n2 in enumerate(mayo_names): 
        mayo_sim.append((n1,n2,x[i][j]))
mayo_sim = pd.DataFrame(mayo_sim, columns=['mayo1','mayo2','bert_cosine_similarity'])
# Keep pairs (unmatched mayo1, matched mayo2), skip names containing "allergy",
# sort by similarity descending and renumber rows from zero.
mayo_sim_match = mayo_sim.query('mayo1 in @remaining').query('mayo2 not in @remaining').query('mayo1 != mayo2')\
.query('not mayo1.str.contains("allergy")').sort_values('bert_cosine_similarity', ascending=False).reset_index().drop('index',axis=1)
# Hand-picked rejections among the first 76 sorted rows; every other row in 0..75 is accepted.
# NOTE(review): these positions depend on the exact sort order of this data, and
# .loc on labels removed by the > 0.97 filter would raise — confirm when data changes.
ignore = [32,33,35,36,39,40,42,45,47,52,54,61,63,64,65,66,68,69,70,71,72,74,75]
good_idx = list(set(list(range(76)))-set(ignore))
mayo_sim_match_found = mayo_sim_match.query('bert_cosine_similarity >0.97').loc[good_idx, :]
new = []
for _, m1, m2, _ in mayo_sim_match_found.itertuples(): 
    # m1: Mayo name still unmatched
    # m2: Mayo name that already has KG matches in `found`
    for nn in found.query('mayo_name_lower == @m2').get('node_name_lower').values: 
        new.append((m1,nn))
new = pd.DataFrame(new, columns=found.columns)
found = pd.concat([found, new], ignore_index=True).drop_duplicates()
remaining = set(list(mayo_names)) - set(list(found.get(['mayo_name_lower']).drop_duplicates().values.reshape(-1)))

# Second round on the updated `remaining`, also skipping names containing
# "polyps" or "cancer"; accepted rows are again hand-picked by sorted position.
mayo_sim_match2 = mayo_sim.query('mayo1 in @remaining').query('mayo2 not in @remaining').query('mayo1 != mayo2')\
.query('not mayo1.str.contains("allergy")').query('not mayo1.str.contains("polyps")').query('not mayo1.str.contains("cancer")')\
.sort_values('bert_cosine_similarity', ascending=False).reset_index().drop('index',axis=1)
good_idx = [0,1,2,7,8,9,12,15,20,25,29,42,41,46,58,59,79,82,92,104,105,106,108,111,
            113,133,134,154,156,157,158,165]
mayo_sim_match_found2 = mayo_sim_match2.loc[good_idx, :]
new = []
for _, m1, m2, _ in mayo_sim_match_found2.itertuples(): 
    # m1: Mayo name still unmatched
    # m2: Mayo name that already has KG matches in `found`
    for nn in found.query('mayo_name_lower == @m2').get('node_name_lower').values: 
        new.append((m1,nn))
new = pd.DataFrame(new, columns=found.columns)
found = pd.concat([found, new], ignore_index=True).drop_duplicates()

# For these Mayo names, drop every accepted pair except rows where the Mayo
# name and the KG name are identical.
wrong = ['coma', 'milia', 'ten', 'lice', 'edema', 'gout']
found = pd.concat([found.query('mayo_name_lower not in @wrong'), found.query('mayo_name_lower==node_name_lower')], ignore_index=True).drop_duplicates()

In [68]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Candidate pairs for KG names that carry a parenthesised qualifier,
# e.g. "hypospadias (disease)": compare the Mayo name to the text before " (".
review1 = []
for mn in mayo_names:
    for nn in node_names:
        if '(' not in nn:
            continue
        nn1 = nn.split(' (')[0]
        # Keep the pair on an exact match, or when the stripped KG name occurs
        # in the Mayo name preceded by a space.
        if mn == nn1 or (' ' + nn1) in mn:
            review1.append((mn, nn))

review1 = pd.DataFrame(
    review1, columns=['mayo_name_lower', 'node_name_lower']
).drop_duplicates()

In [73]:
# Accept the qualifier-stripped candidates for Mayo names that are still
# unmatched, except pairs pointing at the two excluded KG names.
ignore = ['leukemia (disease)', 'colitis (disease)']
ok = review1[review1['mayo_name_lower'].isin(remaining)].drop_duplicates()
ok = ok[~ok['node_name_lower'].isin(ignore)]

found = pd.concat([found, ok], ignore_index=True).drop_duplicates()

In [98]:
# Candidates for Mayo names of the form "<head>,<rest>" (exactly one comma):
# pair them with any KG name that contains, or is contained in, the head.
# Heads and KG names of four characters or fewer are skipped.
review2 = []
for mn in mayo_names:
    parts = mn.split(',')
    if len(parts) != 2:
        continue
    head = parts[0]
    if len(head) <= 4:
        continue
    for nn in node_names:
        if len(nn) > 4 and (head in nn or nn in head):
            review2.append((mn, nn))
review2 = pd.DataFrame(
    review2, columns=['mayo_name_lower', 'node_name_lower']
).drop_duplicates()

# Hand-picked row labels of `review2` accepted as matches.
good_idx = [
    10, 63, 71, 83, 103, 118, 138, 170, 264, 298, 305, 441, 509, 561, 569, 588, 630, 674,
    702, 711, 729, 747, 997, 1000, 1020, 1120, 1149,
]

found = pd.concat([found, review2.loc[good_idx]], ignore_index=True).drop_duplicates()

In [None]:
# Persist the accepted (Mayo name, KG name) pairs to found.csv.
found.to_csv('found.csv')

In [116]:
# Reload the saved pairs and report the share of distinct Mayo names that
# have at least one accepted KG match.
found = read_csv('found')
remaining = set(mayo_names) - set(found['mayo_name_lower'])
1 - len(remaining) / len(set(mayo_names))

0.669510552312528

In [125]:
# Inspect the columns of mayo_map before joining on them.
mayo_map.columns

Index(['node_id', 'node_name', 'node_name_lower'], dtype='object')

In [142]:
# Resolve the lower-cased name pairs back to KG node ids/names and the
# original Mayo names, then save the raw mapping.
mayo_kg_map = (
    found
    .merge(mayo_map, on='node_name_lower')
    .merge(mayo_data[['name', 'mayo_name_lower']], on='mayo_name_lower')
    [['node_id', 'node_name', 'name']]
    .rename(columns={'name': 'mayo_name'})
)

mayo_kg_map.to_csv('mayo_kg_map_raw.csv')

In [143]:
# Preview the raw Mayo-to-KG mapping. The substitution of grouped disease
# nodes for raw nodes is done in a later cell, not here.
mayo_kg_map.head()

Unnamed: 0,node_id,node_name,mayo_name
0,21107,narcolepsy,"Daytime sleepiness, also known as Narcolepsy"
1,21107,narcolepsy,Narcolepsy
2,16158,narcolepsy-cataplexy syndrome,Narcolepsy
3,8062,narcolepsy 1,Narcolepsy
4,9118,disseminated sclerosis with narcolepsy,Narcolepsy


In [157]:
# Load the disease grouping table with every column cast to str, so ids such
# as node_id are compared as strings below.
grouped_diseases = read_csv('../knowledge_graph/grouped_diseases').astype(str)
grouped_diseases.head()

Unnamed: 0,node_id,node_type,node_name,node_source,group_num,group_id,group_name
0,7254,disease,breast cancer,MONDO,0,7254_21100_3582_4989,breast cancer\breast neoplasm\hereditary breast ovarian cancer syndrome\breast carcinoma
1,16419,disease,hereditary breast carcinoma,MONDO,1,16419_6244_4379_4953,hereditary breast carcinoma\HER2 positive breast carcinoma\female breast carcinoma\invasive ductal breast carcinoma
2,21100,disease,breast neoplasm,MONDO,0,7254_21100_3582_4989,breast cancer\breast neoplasm\hereditary breast ovarian cancer syndrome\breast carcinoma
3,3582,disease,hereditary breast ovarian cancer syndrome,MONDO,0,7254_21100_3582_4989,breast cancer\breast neoplasm\hereditary breast ovarian cancer syndrome\breast carcinoma
4,16266,disease,squamous cell carcinoma of the corpus uteri,MONDO,2,16266_16267,squamous cell carcinoma of the corpus uteri\undifferentiated carcinoma of the corpus uteri


In [178]:
# Replace each mapped KG node by its disease group (group_id / group_name)
# when grouped_diseases lists that exact (node_id, node_name) pair; rows with
# no group are left unchanged.
mayo_kg_map_group = mayo_kg_map.copy()

# Index grouped_diseases once by (node_id, node_name) instead of scanning the
# whole table for every mapping row. setdefault keeps the first listing of a
# pair, matching the previous "take the first matching row" behaviour.
group_lookup = {}
for node_id, node_name, group_id, group_name in zip(
    grouped_diseases['node_id'],
    grouped_diseases['node_name'],
    grouped_diseases['group_id'],
    grouped_diseases['group_name'],
):
    group_lookup.setdefault((node_id, node_name), (group_id, group_name))

for data in mayo_kg_map.itertuples():
    group = group_lookup.get((data.node_id, data.node_name))
    if group is not None:
        mayo_kg_map_group.loc[data.Index, 'node_id'] = group[0]
        mayo_kg_map_group.loc[data.Index, 'node_name'] = group[1]

In [183]:
# Rows that became identical after group substitution are collapsed to one,
# then the grouped mapping is written to mayo_kg_map_group.csv.
mayo_kg_map_group = mayo_kg_map_group.drop_duplicates()
mayo_kg_map_group.to_csv('mayo_kg_map_group.csv')

In [186]:
# Number of distinct KG node / group ids in the grouped mapping (a missing value counts as one id).
mayo_kg_map_group['node_id'].nunique(dropna=False)

5595

In [187]:
# Number of distinct Mayo names in the grouped mapping (a missing value counts as one name).
mayo_kg_map_group['mayo_name'].nunique(dropna=False)

1491