Get GARD Dictionary

In [1]:
## Section: GARD SEARCH
# can identify rare diseases in text using the GARD dictionary from neo4j
# and map a GARD ID, name, or synonym to all of the related synonyms for searching APIs
from typing import List, Dict, Union, Optional, Set, Tuple
from nltk import tokenize as nltk_tokenize
import requests

class GARD_Search:
    def __init__(self):
        import json, codecs
        #These are opened locally so that garbage collection removes them from memory
        try:
            with codecs.open('gard-id-name-synonyms.json', 'r', 'utf-8-sig') as f:
                diseases = json.load(f)
        except:
            r = requests.get('https://raw.githubusercontent.com/ncats/epi4GARD/master/EpiExtract4GARD/gard-id-name-synonyms.json')
            diseases = json.loads(r.content)
        
        from nltk.corpus import stopwords
        try:
            STOPWORDS = set(stopwords.words('english'))
        except:
            import nltk
            nltk.download('stopwords')
            STOPWORDS = set(stopwords.words('english'))
        
        #This should be a list of all GARD IDs for purposes like random choice for testing
        GARD_id_list = [entry['gard_id'] for entry in diseases]
        #keys are going to be disease names, values are going to be the GARD ID, set up this way bc dictionaries are faster lookup than lists
        GARD_dict = {}
        #Find out what the length of the longest disease name sequence is, of all names and synonyms. This is used by get_diseases
        max_length = -1
        for entry in diseases:
            if entry['name'] not in GARD_dict.keys():
                s = entry['name'].lower().strip()
                if s not in STOPWORDS and len(s)>5:
                    GARD_dict[s] = entry['gard_id']
                    #compare length
                    max_length = max(max_length,len(s.split()))

            if entry['synonyms']:
                for synonym in entry['synonyms']:
                    if synonym not in GARD_dict.keys():
                        s = synonym.lower().strip()
                        if s not in STOPWORDS and len(s)>5:
                            GARD_dict[s] = entry['gard_id']
                            max_length = max(max_length,len(s.split()))
                            
        self.GARD_id_list = GARD_id_list
        self.GARD_dict = GARD_dict
        self.max_length = max_length
    
    def __str__(self) -> str:
        return '''Instantiation: rd_identify = GARD_Search()
                  Calling: diseases, ids = rd_identify(text) 
                  Autosearch or get disease names from GARD ID: search_terms = rd_identify.autosearch(searchterm)
                  GARD ID List: rd_identify.GARD_id_list
                  Disease Dictionary: rd_identify.GARD_dict
               '''
    
    def __call__(self, sentence:str) -> Tuple[List[str], List[str]]:
        return self.get_diseases(sentence)
    
    #Works much faster if broken down into sentences.
    #compares every phrase in a sentence to see if it matches anything in the GARD dictionary of diseases.
    def get_diseases(self, sentence:str) -> Tuple[List[str], List[str]]:   
        tokens = [s.lower().strip() for s in nltk_tokenize.word_tokenize(sentence)]
        diseases = []
        ids = []
        i=0
        #Iterates through every word, builds string that is max_length or less to compare.
        while i <len(tokens):
            #Find out the length of the comparison string, either max_length or less. This brings algorithm from O(n^2) to O(n) time
            compare_length = min(len(tokens)-i, self.max_length)

            #Compares longest sequences first and goes down until there is a match
            #print('(start compare_length)',compare_length)
            while compare_length>0:
                s = ' '.join(tokens[i:i+compare_length])
                if s.lower() in self.GARD_dict.keys():
                    diseases.append(s)
                    ids.append(self.GARD_dict[s.lower()])
                    #Need to skip over the next few indexes
                    i+=compare_length-1
                    break
                else:
                    compare_length-=1
            i+=1
        return diseases,ids
    
    #Given a GARD ID of form 'GARD:0000001', this will return all of the names of the disease
    def get_names_from_id(self, gard_id):
        return [k for k,v in self.GARD_dict.items() if v==gard_id]
    
    #Can search by 7-digit GARD_ID, 12-digit "GARD:{GARD_ID}", matched search term, or arbitrary search term
    #Returns list of terms to search by
    # search_term_list = autosearch(search_term, GARD_dict)
    def autosearch(self, searchterm:Union[str,int], matching=2) -> List[str]:
        #comparisons below only handly strings, allows int input
        if type(searchterm) is not str:
            searchterm = str(searchterm)

        #for the disease names to match
        searchterm = searchterm.lower()

        while matching>=1:
            #search in form of 'GARD:0000001'
            if 'gard:' in searchterm and len(searchterm)==12:
                searchterm = searchterm.replace('gard:','GARD:')
                l = self.get_names_from_id(searchterm)
                l.sort(reverse=True, key=lambda x:len(x))
                if len(l)>0:
                    print("SEARCH TERM MATCHED TO GARD DICTIONARY. SEARCHING FOR: ",l)
                    return l

            #can take int or str of digits of variable input
            #search in form of 777 or '777' or '00777' or '0000777'
            elif searchterm[0].isdigit() and searchterm[-1].isdigit():
                if len(searchterm)>7:
                    raise ValueError('GARD ID IS NOT VALID. RE-ENTER SEARCH TERM')
                searchterm = 'GARD:'+'0'*(7-len(str(searchterm)))+str(searchterm)
                l = self.get_names_from_id(searchterm)
                l.sort(reverse=True, key=lambda x:len(x))
                if len(l)>0:
                    print("SEARCH TERM MATCHED TO GARD DICTIONARY. SEARCHING FOR: ",l)
                    return l

            #search in form of 'mackay shek carr syndrome' and returns all synonyms ('retinal degeneration with nanophthalmos, cystic macular degeneration, and angle closure glaucoma', 'retinal degeneration, nanophthalmos, glaucoma', 'mackay shek carr syndrome')
            #considers the GARD ID as the lemma, and the search term as one form. maps the form to the lemma and then uses that lemma to find all related forms in the GARD dict. 
            elif searchterm in self.GARD_dict.keys():
                l = self.get_names_from_id(searchterm)
                l.sort(reverse=True, key=lambda x:len(x))
                print("SEARCH TERM MATCHED TO GARD DICTIONARY. SEARCHING FOR: ",l)
                return l

            else:
                #This can be replaced with some other common error in user input that is easily fixed
                searchterm = searchterm.replace('-',' ')
                searchterm = searchterm.replace("'s","")
                return self.autosearch(searchterm, matching-1)
        print("SEARCH TERM DID NOT MATCH TO GARD DICTIONARY. SEARCHING BY USER INPUT")
        return [searchterm]

    # Return a random GARD_ID Search Term list
    def random_disease(self) -> List[str]:
        import random
        gard_id = random.choice(self.GARD_id_list)
        return self.autosearch(gard_id)

In [2]:
gard_info = GARD_Search()

In [3]:
print(gard_info)

Instantiation: rd_identify = GARD_Search()
                  Calling: diseases, ids = rd_identify(text) 
                  Autosearch or get disease names from GARD ID: search_terms = rd_identify.autosearch(searchterm)
                  GARD ID List: rd_identify.GARD_id_list
                  Disease Dictionary: rd_identify.GARD_dict
               


In [4]:
GARD_id_dict = {gard_id:gard_info.get_names_from_id(gard_id) for gard_id in gard_info.GARD_id_list}

Logic of scoring

Score1:
```
if same_cluster:
    score1 = 5
else:
    score1 =0

```
Score 2:
```
length_comparison = gard_info.max_length*2

if same_cluster:
    score2 = 5
else:
    score2 = 3*(length_comparison-len(
    nltk_tokenize.word_tokenize(disease_name1))-len(
    nltk_tokenize.word_tokenize(disease_name2))
    )/length_comparison
```

In [5]:
thousand_dataset_dict = {
               "split":[],
               "sid":[],
               "score1":[],
               "score2":[],
               "term1":[],
               "gard_id1":[],
               "term2":[],
               "gard_id2":[],
              }

In [10]:
multiple_names = {k:v for k,v in GARD_id_dict.items() if len(v)>1}
single_names = {k:v for k,v in GARD_id_dict.items() if len(v)<2}

In [11]:
multiple_names

{'GARD:0000001': ['gracile syndrome',
  'finnish lactic acidosis with hepatic hemosiderosis',
  'fellman syndrome',
  'growth retardation, aminoaciduria, cholestasis, iron overload, lactic acidosis and early death',
  'finnish lethal neonatal metabolic syndrome',
  'fellman disease',
  'growth delay-aminoaciduria-cholestasis-iron overload-lactic acidosis-early death syndrome',
  'growth restriction-aminoaciduria-cholestasis-iron overload-lactic acidosis-early death syndrome'],
 'GARD:0000003': ['ablepharon macrostomia syndrome',
  'congenital ablepharon, absent eyelashes/eyebrows, macrostomia, auricular, nasal, genital and other systemic anomalies'],
 'GARD:0000004': ['acanthocheilonemiasis',
  'dipetalonemiasis',
  'dipetalonema infections',
  'acanthocheilonema perstans infection',
  'mansonella perstans'],
 'GARD:0000005': ['abetalipoproteinemia',
  'bassen kornzweig syndrome',
  'microsomal triglyceride transfer protein deficiency disease',
  'microsomal triglyceride transfer prote

In [15]:
rand_id = random.choice(list(multiple_names.keys()))
rand_id

'GARD:0009669'

In [12]:
single_names

{'GARD:0000091': ['malignant melanoma, childhood'],
 'GARD:0000101': ['centronuclear myopathy'],
 'GARD:0000161': ['saul wilkes stevenson syndrome'],
 'GARD:0000166': ['schizencephaly'],
 'GARD:0000183': ['bubonic plague'],
 'GARD:0000203': ["crohn's disease of the esophagus"],
 'GARD:0000206': ['nephronophthisis'],
 'GARD:0000219': ['congenital giant megaureter'],
 'GARD:0000225': ['radial ray agenesis'],
 'GARD:0000273': ['wrinkly skin syndrome'],
 'GARD:0000303': ['jeune syndrome situs inversus'],
 'GARD:0000317': ['sacral hemangiomas multiple congenital abnormalities'],
 'GARD:0000319': ['sacrococcygeal teratoma'],
 'GARD:0000412': ['cheilitis glandularis'],
 'GARD:0000431': ['microtia-anotia'],
 'GARD:0000456': ['achalasia microcephaly syndrome'],
 'GARD:0000478': ['acral dysostosis dyserythropoiesis syndrome'],
 'GARD:0000495': ['acrofacial dysostosis preis type'],
 'GARD:0000506': ['acromesomelic dysplasia hunter thompson type'],
 'GARD:0000519': ['idiopathic acute eosinophilic 

In [16]:
#Build the similar dataset
import random, itertools
i = 0
while i<500:
    gard_id = random.choice(list(multiple_names.keys()))
    rand_list = multiple_names[gard_id]
    pair_order_list = list(itertools.combinations(rand_list,2))
    #to avoid heavily weighting the non 
    if pair_order_list>3:
        pair_order_list = random.sample(pair_order_list,k=3)
        
    for term1, term2 in pair_order_list:
        thousand_dataset_dict["split"].append('train')
        thousand_dataset_dict["sid"].append(i)
        thousand_dataset_dict["term1"].append(term1)
        thousand_dataset_dict["term2"].append(term2)
        thousand_dataset_dict["gard_id1"].append(gard_id)
        thousand_dataset_dict["gard_id2"].append(gard_id)
        thousand_dataset_dict["score1"].append(5)
        thousand_dataset_dict["score2"].append(5)
        i+=1

ValueError: Sample larger than population or is negative

The number of pairings between names with same GARD ID is `n(n-1)/2`
(positive pairings)

In [25]:
total_positive_pairings = int(sum([(len(l)*(len(l)-1))/2 for l in GARD_id_dict.values()]))

print(f"{total_positive_pairings:,}")

36,509


In [21]:
all_names = []
for names in GARD_id_dict.values():
    for name in names:
        all_names.append(name)

print(len(all_names),len(set(all_names)))

num_names = len(all_names)

19423 19423


The number of pairing between names without same GARD ID is `num(entries not in cluster) * num(entries in cluster)`
(negative pairings)

In [26]:
total_negative_pairings = 0
for names in GARD_id_dict.values():
    total_negative_pairings+=(num_names - len(names))*len(names)
    
print(f"{total_negative_pairings:,}")

377,160,488


In [None]:
max_comparisons = int(sum([(len(l)*(len(l)-1))/2 for l in GARD_id_dict.values()]))
max_comparisons

In [None]:
import pandas

Similar comparisons

In [None]:
https://sbert.net/datasets/stsbenchmark.tsv.gz