# QuickUMLS ini analysis

In [1]:
import pandas as pd
import json
from quickumls import QuickUMLS
import ast

Load QuickUMLS and semtype dictionary searcher run NER. Run under WSL since quickumls installation works better on linux. Takes around 30s

In [2]:
quickumls_db = "/mnt/c/Users/maxji/.data/bio/quickUMLS" #root folder where quickumls is downloaded

matcher = QuickUMLS(
    quickumls_db,
    threshold=0.7,
    similarity_name='jaccard'
)

loading initialization

In [3]:
mrsty_path = "/mnt/c/Users/maxji/.data/bio/umls/2025AA/umls-2025AA-metathesaurus-full/2025AA/META/MRSTY.RRF"
def load_semtype_labels(mrsty_path):
    df = pd.read_csv(
        mrsty_path,
        sep='|',
        header=None,
        dtype=str,
        engine='python'
    )
    # Drop the last empty column if present
    if df.shape[1] > 6:
        df = df.iloc[:, :6]
    df.columns = ['CUI', 'TUI', 'STN', 'STY', 'ATUI', 'CVF']
    df = df[['TUI', 'STY']].drop_duplicates()
    return dict(zip(df['TUI'], df['STY']))
# Example usage
semtype_dict = load_semtype_labels(mrsty_path)
print(semtype_dict['T047'])  # Disease or Syndrome

Disease or Syndrome


# Part 1: reading in our past dataset

In [4]:
df_mini_pairs = pd.read_csv("english-train-paired-conversations.csv")
print("data length:", len(df_mini_pairs)) 
df_mini_pairs.head(n=5)

data length: 480


Unnamed: 0.1,Unnamed: 0,description,utterances
0,0,throat a bit sore and want to get a good imune...,['patient: throat a bit sore and want to get a...
1,1,"hey there i have had cold ""symptoms"" for over ...","['patient: hey there i have had cold ""symptoms..."
2,2,i have a tight and painful chest with a dry co...,['patient: i have a tight and painful chest wi...
3,3,what will happen after the incubation period f...,['patient: what will happen after the incubati...
4,4,suggest treatment for pneumonia,['patient: just found out i was pregnant. yest...


convert the utterances to a list

In [5]:
#currently, each one is being interpreted as a string instread of a list
print(len(df_mini_pairs["utterances"][0]))
df_mini_pairs['utterances'] = df_mini_pairs['utterances'].apply(ast.literal_eval)
print(len(df_mini_pairs["utterances"][0]))

603
2


# Using the QuickUMLS model on a single data sample, data format

Testing data on a single datapoint.

Split based on last : found

In [6]:
#
index = 0
patient = df_mini_pairs.iloc[index]["utterances"][0].replace("patient:","").strip() #raw patient string replacing doctor, patient
clinician = df_mini_pairs.iloc[index]["utterances"][1].replace("doctor:","").strip() #raw clinican stinrg
print(patient,"\n",clinician)

throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus. 
 during this pandemic. throat pain can be from a strep throat infection (antibiotics needed), a cold or influenza or other virus, or from some other cause such as allergies or irritants. usually, a person sees the doctor (call first) if the sore throat is bothersome, recurrent, or doesn't go away quickly. covid-19 infections tend to have cough, whereas strep throat usually lacks cough but has more throat pain. (3/21/20)


In [7]:
matches = matcher.match(patient, best_match=True, ignore_syntax=False)
for x in matches:
    #output all matches
    for final in x:
        print("term:", final["term"])
        semtypes_list = list(final["semtypes"])
        print("semtypes:", semtypes_list) #convert 
        #now, convert semtypes to codes using the dictionary matcher
        semtypes_matches = []
        for type in semtypes_list:
            semtypes_matches.append(semtype_dict[type])
        print("semtype matches:", semtypes_matches)
        print(final)

term: booster
semtypes: ['T061']
semtype matches: ['Therapeutic or Preventive Procedure']
{'start': 47, 'end': 54, 'ngram': 'booster', 'term': 'booster', 'cui': 'C0020975', 'similarity': 1.0, 'semtypes': {'T061'}, 'preferred': 1}
term: contact
semtypes: ['T067']
semtype matches: ['Phenomenon or Process']
{'start': 122, 'end': 129, 'ngram': 'contact', 'term': 'contact', 'cui': 'C0392367', 'similarity': 1.0, 'semtypes': {'T067'}, 'preferred': 1}
term: contact
semtypes: ['T170']
semtype matches: ['Intellectual Product']
{'start': 122, 'end': 129, 'ngram': 'contact', 'term': 'contact', 'cui': 'C3245509', 'similarity': 1.0, 'semtypes': {'T170'}, 'preferred': 1}
term: throat
semtypes: ['T023']
semtype matches: ['Body Part, Organ, or Organ Component']
{'start': 0, 'end': 6, 'ngram': 'throat', 'term': 'throat', 'cui': 'C3665375', 'similarity': 1.0, 'semtypes': {'T023'}, 'preferred': 1}
term: throats
semtypes: ['T023']
semtype matches: ['Body Part, Organ, or Organ Component']
{'start': 0, 'end'

Creating a generalized function to help find matches 


In [8]:
#returns a dictionary of terms to a tuple of semtype_list and matched terms_list with the same length

def quickumls_matcher(text):
    matches = matcher.match(text, best_match=True, ignore_syntax=False)
    terms_dictionary = {}
    for x in matches:
        #output all matches
        for final in x:
            semtypes_list = list(final["semtypes"])
            #now, convert semtypes to codes using the dictionary matcher
            semtypes_matches = []
            for type in semtypes_list:
                semtypes_matches.append(semtype_dict[type])
            terms_dictionary[final["term"]] = (semtypes_list, semtypes_matches)
    return terms_dictionary

In [9]:
text = "The patient was given aspirin for pain relief."
clinician_entity_dict = {}
terms_dictionary = quickumls_matcher(clinician)
print(terms_dictionary)
for term in terms_dictionary: #each term is a key
    data = terms_dictionary[term]
    semtypes_list = data[0]
    semtypes_matches = data[1]
    print("\n____________________________________________________\n")

{'throat infection': (['T047'], ['Disease or Syndrome']), 'strep throat': (['T047'], ['Disease or Syndrome']), 'throat pain': (['T184'], ['Sign or Symptom']), 'Throat pain': (['T184'], ['Sign or Symptom']), 'sore throat': (['T184'], ['Sign or Symptom']), 'No sore throat': (['T033'], ['Finding']), 'infections': (['T046'], ['Pathologic Function']), 'infection': (['T047'], ['Disease or Syndrome']), 'Reinfections': (['T046'], ['Pathologic Function']), 'Coinfections': (['T047'], ['Disease or Syndrome']), 'Infections': (['T046'], ['Pathologic Function']), 'Re-infections': (['T046'], ['Pathologic Function']), 'Co-infections': (['T047'], ['Disease or Syndrome']), 'infections op': (['T047'], ['Disease or Syndrome']), 'gi infections': (['T047'], ['Disease or Syndrome']), 'Reinfection': (['T046'], ['Pathologic Function']), 'Coinfection': (['T047'], ['Disease or Syndrome']), 'influenza': (['T047'], ['Disease or Syndrome']), 'influenza B': (['T047'], ['Disease or Syndrome']), 'influenza C': (['T047

# Finding Entities: Analyzing through the mini data using QuickUMLS


Finding the entity_groups found for both patient and clinician data, and the count of mapped entities for each patient category

In [10]:
patient_entity_dict = {}
clinician_entity_dict = {}
for index,row in df_mini_pairs.iterrows():
    utterances = row["utterances"]
    
    try:  
        patient =  utterances[0].replace("patient:","").strip()
        terms_dictionary = quickumls_matcher(patient)
        #loop through the entities. Each term is a key
        for term in terms_dictionary:
            data = terms_dictionary[term]
            semtypes_list = data[0]
            semtypes_matches = data[1]
            #then, loop through the matches 
            for match in semtypes_matches:
                if match not in patient_entity_dict:
                    patient_entity_dict[match] = 1
                else:
                    patient_entity_dict[match] += 1
    except Exception as e:
        print(f"An error occurred in patient data: {e}")
        pass

    try: 
        clinician = utterances[1].replace("doctor:","").strip()
        terms_dictionary = quickumls_matcher(clinician)
        for term in terms_dictionary:
            data = terms_dictionary[term]
            semtypes_list = data[0]
            semtypes_matches = data[1]
            #then, loop through the matches 
            for match in semtypes_matches:
                if match not in clinician_entity_dict:
                    clinician_entity_dict[match] = 1
                else:
                    clinician_entity_dict[match] += 1
    except Exception as e:
        print(f"An error occurred in clinican data: {e}")
        pass


Analyzing the set of patient entity

In [11]:
print(patient_entity_dict)

{'Therapeutic or Preventive Procedure': 273, 'Intellectual Product': 563, 'Body Part, Organ, or Organ Component': 381, 'Disease or Syndrome': 862, 'Sign or Symptom': 866, 'Phenomenon or Process': 53, 'Finding': 1152, 'Organism Function': 109, 'Body Location or Region': 116, 'Laboratory or Test Result': 42, 'Laboratory Procedure': 41, 'Diagnostic Procedure': 84, 'Pharmacologic Substance': 339, 'Hormone': 11, 'Organic Chemical': 255, 'Medical Device': 117, 'Inorganic Chemical': 9, 'Mental or Behavioral Dysfunction': 50, 'Vitamin': 17, 'Mental Process': 187, 'Pathologic Function': 114, 'Health Care Activity': 237, 'Amino Acid, Peptide, or Protein': 22, 'Antibiotic': 205, 'Body Substance': 61, 'Physiologic Function': 40, 'Clinical Attribute': 82, 'Indicator, Reagent, or Diagnostic Aid': 25, 'Immunologic Factor': 23, 'Injury or Poisoning': 41, 'Neoplastic Process': 10, 'Hazardous or Poisonous Substance': 1, 'Clinical Drug': 22, 'Biologically Active Substance': 4, 'Element, Ion, or Isotope':

In [12]:
print(clinician_entity_dict)

{'Disease or Syndrome': 1662, 'Sign or Symptom': 714, 'Finding': 1711, 'Pathologic Function': 522, 'Clinical Attribute': 191, 'Phenomenon or Process': 72, 'Organism Function': 159, 'Antibiotic': 227, 'Intellectual Product': 1236, 'Body Substance': 69, 'Neoplastic Process': 21, 'Mental Process': 504, 'Mental or Behavioral Dysfunction': 37, 'Pharmacologic Substance': 867, 'Hormone': 35, 'Amino Acid, Peptide, or Protein': 33, 'Therapeutic or Preventive Procedure': 562, 'Diagnostic Procedure': 116, 'Health Care Activity': 646, 'Organic Chemical': 419, 'Body Part, Organ, or Organ Component': 286, 'Vitamin': 30, 'Laboratory Procedure': 98, 'Inorganic Chemical': 38, 'Medical Device': 209, 'Physiologic Function': 57, 'Immunologic Factor': 50, 'Injury or Poisoning': 72, 'Body Location or Region': 65, 'Hazardous or Poisonous Substance': 10, 'Indicator, Reagent, or Diagnostic Aid': 42, 'Biologically Active Substance': 10, 'Clinical Drug': 11, 'Laboratory or Test Result': 56, 'Element, Ion, or Iso

Finding the sum of values in total

In [13]:
print("number of patient data:", sum(patient_entity_dict.values()))
print("number of clinican data:", sum(clinician_entity_dict.values()))

number of patient data: 6423
number of clinican data: 10868


Number of unique values in the keys

In [14]:
print( "number of unique patient entites:", len(set(patient_entity_dict.keys())) )

number of unique patient entites: 37


In [15]:
print( "number of unique clinican entites:", len(set(clinician_entity_dict.keys())) )

number of unique clinican entites: 38


Number of clinican entities is similar, with the only difference being:

In [16]:
set(clinician_entity_dict.keys()) - set(patient_entity_dict.keys())

{'Food', 'Manufactured Object'}

Sorting based on values

In [17]:
patient_entity_dict_sorted = sorted(patient_entity_dict.items(), key=lambda  item: item[1], reverse=True)
patient_entity_dict_sorted

[('Finding', 1152),
 ('Sign or Symptom', 866),
 ('Disease or Syndrome', 862),
 ('Intellectual Product', 563),
 ('Body Part, Organ, or Organ Component', 381),
 ('Pharmacologic Substance', 339),
 ('Therapeutic or Preventive Procedure', 273),
 ('Organic Chemical', 255),
 ('Health Care Activity', 237),
 ('Antibiotic', 205),
 ('Mental Process', 187),
 ('Medical Device', 117),
 ('Body Location or Region', 116),
 ('Pathologic Function', 114),
 ('Organism Function', 109),
 ('Diagnostic Procedure', 84),
 ('Clinical Attribute', 82),
 ('Body Substance', 61),
 ('Phenomenon or Process', 53),
 ('Mental or Behavioral Dysfunction', 50),
 ('Laboratory or Test Result', 42),
 ('Laboratory Procedure', 41),
 ('Injury or Poisoning', 41),
 ('Physiologic Function', 40),
 ('Indicator, Reagent, or Diagnostic Aid', 25),
 ('Immunologic Factor', 23),
 ('Amino Acid, Peptide, or Protein', 22),
 ('Clinical Drug', 22),
 ('Vitamin', 17),
 ('Hormone', 11),
 ('Neoplastic Process', 10),
 ('Inorganic Chemical', 9),
 ('Nucl

In [18]:
clinican_entity_dict_sorted = sorted(clinician_entity_dict.items(), key=lambda  item: item[1], reverse=True)
clinican_entity_dict_sorted

[('Finding', 1711),
 ('Disease or Syndrome', 1662),
 ('Intellectual Product', 1236),
 ('Pharmacologic Substance', 867),
 ('Sign or Symptom', 714),
 ('Health Care Activity', 646),
 ('Therapeutic or Preventive Procedure', 562),
 ('Pathologic Function', 522),
 ('Mental Process', 504),
 ('Organic Chemical', 419),
 ('Body Part, Organ, or Organ Component', 286),
 ('Antibiotic', 227),
 ('Medical Device', 209),
 ('Clinical Attribute', 191),
 ('Organism Function', 159),
 ('Diagnostic Procedure', 116),
 ('Laboratory Procedure', 98),
 ('Phenomenon or Process', 72),
 ('Injury or Poisoning', 72),
 ('Body Substance', 69),
 ('Body Location or Region', 65),
 ('Physiologic Function', 57),
 ('Laboratory or Test Result', 56),
 ('Immunologic Factor', 50),
 ('Indicator, Reagent, or Diagnostic Aid', 42),
 ('Inorganic Chemical', 38),
 ('Mental or Behavioral Dysfunction', 37),
 ('Hormone', 35),
 ('Amino Acid, Peptide, or Protein', 33),
 ('Vitamin', 30),
 ('Neoplastic Process', 21),
 ('Nucleic Acid, Nucleoside

Finding the set differences

In [19]:
set(patient_entity_dict.keys()).difference(set(clinician_entity_dict.keys()))

{'Congenital Abnormality'}

In [20]:
set(clinician_entity_dict.keys()).difference(set(patient_entity_dict.keys()))

{'Food', 'Manufactured Object'}

We can see that there are few mappings. Additionally, we 

# Finding entity mismatches: looping through the minidata to find mismatches between entities the paired data

Now, we can perform a different type of analysis. Now, that we know that there is a discrepancy between total values, what are differences in the occurance per dialogue. We use the list of found entities before 

In [21]:
clinician_entity_dict.keys()

dict_keys(['Disease or Syndrome', 'Sign or Symptom', 'Finding', 'Pathologic Function', 'Clinical Attribute', 'Phenomenon or Process', 'Organism Function', 'Antibiotic', 'Intellectual Product', 'Body Substance', 'Neoplastic Process', 'Mental Process', 'Mental or Behavioral Dysfunction', 'Pharmacologic Substance', 'Hormone', 'Amino Acid, Peptide, or Protein', 'Therapeutic or Preventive Procedure', 'Diagnostic Procedure', 'Health Care Activity', 'Organic Chemical', 'Body Part, Organ, or Organ Component', 'Vitamin', 'Laboratory Procedure', 'Inorganic Chemical', 'Medical Device', 'Physiologic Function', 'Immunologic Factor', 'Injury or Poisoning', 'Body Location or Region', 'Hazardous or Poisonous Substance', 'Indicator, Reagent, or Diagnostic Aid', 'Biologically Active Substance', 'Clinical Drug', 'Laboratory or Test Result', 'Element, Ion, or Isotope', 'Nucleic Acid, Nucleoside, or Nucleotide', 'Food', 'Manufactured Object'])

In [22]:
#initialize a dictionary of mismatches 
patient_mismatch_dict = {} #entities mapped in patient text but not in the clinican text
clinician_mismatch_dict = {}  #entities mapped in clinican text but not in the patient text
for key in list(patient_entity_dict.keys()):
    patient_mismatch_dict[key] = 0
for key in list(clinician_entity_dict.keys()):
    clinician_mismatch_dict[key] = 0


In [23]:
#track the row indices with a patient mismatch mapping diex w
patient_mismatch_rows = []
#track the rows with a  clinican mismatch
clinician_mismatch_rows = []

for index,row in df_mini_pairs.iterrows():
    current_patient_entities = set()
    current_clinican_entities = set()
    utterances = row["utterances"]
    
    try:  
        patient =  utterances[0].replace("patient:","").strip()
        terms_dictionary = quickumls_matcher(patient)
        #loop through the entities. Each term is a key
        for term in terms_dictionary:
            data = terms_dictionary[term]
            semtypes_list = data[0]
            semtypes_matches = data[1]
            #add matches to the set
            current_patient_entities.update(semtypes_matches)

    except Exception as e:
        print(f"An error occurred in patient data: {e}")
        pass

    try: 
        clinician = utterances[1].replace("doctor:","").strip()
        terms_dictionary = quickumls_matcher(clinician)
        for term in terms_dictionary:
            data = terms_dictionary[term]
            semtypes_list = data[0]
            semtypes_matches = data[1]
            #add matches to the set 
            current_clinican_entities.update(semtypes_matches)

    except:
        print("no clinican mapping at index:", index)
        pass 
    #find mismatches by computing set differneces 
    #entities mapped in patient text but not in the clinican text. increment the dictionary of mismatches 
    patient_mismatch = current_patient_entities - current_clinican_entities
    if len(patient_mismatch) > 0:
        patient_mismatch_rows.append(index) #if the number of mismatches is non zero, append
    for key in patient_mismatch:
        patient_mismatch_dict[key] += 1

    clinician_mismatch = current_clinican_entities - current_patient_entities
    if len(clinician_mismatch) > 0:
        clinician_mismatch_rows.append(index) #if the number of mismatches is non zero, append
    for key in clinician_mismatch:
        clinician_mismatch_dict[key] += 1



Printing sums of total mismatches

In [24]:
print("number of patient data:", sum(patient_mismatch_dict.values()))
print("number of clinician data:", sum(clinician_mismatch_dict.values()))

number of patient data: 1208
number of clinician data: 2454


number of rows of mismatches and percentages

In [25]:
print("number of patient data mismatches:", len(patient_mismatch_rows))
print("number of clinician data mismatches:", len(clinician_mismatch_rows))

number of patient data mismatches: 388
number of clinician data mismatches: 464


In [26]:
print("percent of rows with patient data mismatches:", len(patient_mismatch_rows)/len(df_mini_pairs)*100)
print("percent of rows with clinician data mismatches:", len(clinician_mismatch_rows)/len(df_mini_pairs)*100)

percent of rows with patient data mismatches: 80.83333333333333
percent of rows with clinician data mismatches: 96.66666666666667


printing out mismatch dictionary

In [27]:
#sort
patient_mismatch_dict = sorted(patient_mismatch_dict.items(), key=lambda  item: item[1], reverse=True)
patient_mismatch_dict

[('Sign or Symptom', 115),
 ('Body Part, Organ, or Organ Component', 110),
 ('Disease or Syndrome', 89),
 ('Body Location or Region', 69),
 ('Therapeutic or Preventive Procedure', 60),
 ('Organic Chemical', 59),
 ('Mental Process', 55),
 ('Medical Device', 46),
 ('Health Care Activity', 44),
 ('Pharmacologic Substance', 41),
 ('Intellectual Product', 40),
 ('Finding', 39),
 ('Organism Function', 39),
 ('Clinical Attribute', 38),
 ('Phenomenon or Process', 37),
 ('Diagnostic Procedure', 37),
 ('Mental or Behavioral Dysfunction', 32),
 ('Body Substance', 31),
 ('Pathologic Function', 30),
 ('Antibiotic', 28),
 ('Physiologic Function', 27),
 ('Laboratory or Test Result', 24),
 ('Indicator, Reagent, or Diagnostic Aid', 21),
 ('Laboratory Procedure', 20),
 ('Injury or Poisoning', 18),
 ('Amino Acid, Peptide, or Protein', 11),
 ('Clinical Drug', 10),
 ('Immunologic Factor', 9),
 ('Inorganic Chemical', 8),
 ('Hormone', 7),
 ('Nucleic Acid, Nucleoside, or Nucleotide', 4),
 ('Biologically Activ

In [28]:
clinician_mismatch_dict = sorted(clinician_mismatch_dict.items(), key=lambda  item: item[1], reverse=True)
clinician_mismatch_dict

[('Mental Process', 229),
 ('Pharmacologic Substance', 213),
 ('Therapeutic or Preventive Procedure', 172),
 ('Health Care Activity', 158),
 ('Intellectual Product', 154),
 ('Clinical Attribute', 127),
 ('Pathologic Function', 115),
 ('Medical Device', 114),
 ('Organic Chemical', 113),
 ('Disease or Syndrome', 110),
 ('Finding', 89),
 ('Organism Function', 89),
 ('Body Part, Organ, or Organ Component', 81),
 ('Laboratory Procedure', 61),
 ('Phenomenon or Process', 56),
 ('Diagnostic Procedure', 55),
 ('Sign or Symptom', 47),
 ('Body Substance', 43),
 ('Laboratory or Test Result', 41),
 ('Physiologic Function', 40),
 ('Indicator, Reagent, or Diagnostic Aid', 40),
 ('Antibiotic', 38),
 ('Injury or Poisoning', 37),
 ('Inorganic Chemical', 36),
 ('Body Location or Region', 31),
 ('Mental or Behavioral Dysfunction', 28),
 ('Immunologic Factor', 27),
 ('Amino Acid, Peptide, or Protein', 21),
 ('Nucleic Acid, Nucleoside, or Nucleotide', 19),
 ('Hormone', 13),
 ('Vitamin', 12),
 ('Neoplastic P

## Looping to extract a set of terms, not entities

Finding a set of medical terms, not entities. This could be large


In [29]:
patient_entity_terms = {}
clinician_entity_terms = {}
for index,row in df_mini_pairs.iterrows():
    utterances = row["utterances"]
    
    try:  
        patient =  utterances[0].replace("patient:","").strip()
        patient_results = pipe(patient)
        #loop through the ntities
        for dict in patient_results:
            curr_entity = dict["word"]
            if curr_entity not in patient_entity_terms:
                patient_entity_terms[curr_entity] = 1
            else: 
                patient_entity_terms[curr_entity] = patient_entity_terms[curr_entity] + 1
    except:
        print("no patient mapping at index:", index)
        pass 

    try: 
        clinician = utterances[1].replace("doctor:","").strip()
        clinician_results = pipe(clinician)
        for dict in clinician_results:
            curr_entity = dict["word"]
            if curr_entity not in clinician_entity_terms:
                clinician_entity_terms[curr_entity] = 1
            else: 
                clinician_entity_terms[curr_entity] = clinician_entity_terms[curr_entity] + 1
    except:
        print("no clinican mapping at index:", index)
        pass 


no patient mapping at index: 0
no clinican mapping at index: 0
no patient mapping at index: 1
no clinican mapping at index: 1
no patient mapping at index: 2
no clinican mapping at index: 2
no patient mapping at index: 3
no clinican mapping at index: 3
no patient mapping at index: 4
no clinican mapping at index: 4
no patient mapping at index: 5
no clinican mapping at index: 5
no patient mapping at index: 6
no clinican mapping at index: 6
no patient mapping at index: 7
no clinican mapping at index: 7
no patient mapping at index: 8
no clinican mapping at index: 8
no patient mapping at index: 9
no clinican mapping at index: 9
no patient mapping at index: 10
no clinican mapping at index: 10
no patient mapping at index: 11
no clinican mapping at index: 11
no patient mapping at index: 12
no clinican mapping at index: 12
no patient mapping at index: 13
no clinican mapping at index: 13
no patient mapping at index: 14
no clinican mapping at index: 14
no patient mapping at index: 15
no clinican m

Printing out side of patient terms and clincians terms and using set similarity

In [30]:
print(len(patient_entity_terms))
patient_entity_terms

0


{}

In [31]:
print(len(clinician_entity_terms))
clinician_entity_terms

0


{}

computing jaccard similarity

In [32]:
pghd_set = set(patient_entity_terms.keys())
clinician_set = set(clinician_entity_terms.keys())

jaccard = len(pghd_set & clinician_set) / len(pghd_set | clinician_set)
print(f"Conceptual Overlap (Jaccard): {jaccard:.3f}")

ZeroDivisionError: division by zero

In [None]:
print(pghd_set-clinician_set)

## future:

Showing (through concepts per 100 words): 
QuickUMLS ontology has beeter coveration