# QuickUMLS ini analysis

In [1]:
import pandas as pd
import json
from quickumls import QuickUMLS
import ast

  from .autonotebook import tqdm as notebook_tqdm


Load QuickUMLS and semtype dictionary searcher run NER. Run under WSL or other Linux environment since quickumls installation works better on linux. Takes around 30s

In [2]:
quickumls_db = "/mnt/c/Users/maxji/.data/bio/quickUMLS" #root folder where quickumls is downloaded

matcher = QuickUMLS(
    quickumls_db,
    threshold=0.7,
    similarity_name='jaccard'
)

loading initialization

In [3]:
mrsty_path = "/mnt/c/Users/maxji/.data/bio/umls/2025AA/umls-2025AA-metathesaurus-full/2025AA/META/MRSTY.RRF"
def load_semtype_labels(mrsty_path):
    df = pd.read_csv(
        mrsty_path,
        sep='|',
        header=None,
        dtype=str,
        engine='python'
    )
    # Drop the last empty column if present
    if df.shape[1] > 6:
        df = df.iloc[:, :6]
    df.columns = ['CUI', 'TUI', 'STN', 'STY', 'ATUI', 'CVF']
    df = df[['TUI', 'STY']].drop_duplicates()
    return dict(zip(df['TUI'], df['STY']))
# Example usage
semtype_dict = load_semtype_labels(mrsty_path)
print(semtype_dict['T047'])  # Disease or Syndrome

Disease or Syndrome


# Part 1: reading in our past dataset

In [4]:
df_mini_pairs = pd.read_csv("english-train-paired-conversations.csv")
print("data length:", len(df_mini_pairs)) 
df_mini_pairs.head(n=5)

data length: 600


Unnamed: 0.1,Unnamed: 0,description,utterances,input,output
0,0,throat a bit sore and want to get a good imune...,['patient: throat a bit sore and want to get a...,patient: throat a bit sore and want to get a g...,doctor: during this pandemic. throat pain can ...
1,1,"hey there i have had cold ""symptoms"" for over ...","['patient: hey there i have had cold ""symptoms...","patient: hey there i have had cold ""symptoms"" ...",doctor: yes. protection. it is not enough symp...
2,2,i have a tight and painful chest with a dry co...,['patient: i have a tight and painful chest wi...,patient: i have a tight and painful chest with...,"doctor: possible. top symptoms include fever, ..."
3,3,what will happen after the incubation period f...,['patient: what will happen after the incubati...,patient: what will happen after the incubation...,doctor: in brief: symptoms if you are infected...
4,4,suggest treatment for pneumonia,['patient: just found out i was pregnant. yest...,patient: just found out i was pregnant. yester...,doctor: thanks for your question on healthcare...


# Using the QuickUMLS model on a single data sample, data format

Testing data on a single datapoint.

Split based on last : found

In [5]:
#
index = 0
patient = df_mini_pairs.iloc[index]["input"].replace("patient:","").strip() #raw patient string replacing doctor, patient
clinician = df_mini_pairs.iloc[index]["output"].replace("doctor:","").strip() #raw clinican stinrg
print(patient,"\n",clinician)

throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus. 
 during this pandemic. throat pain can be from a strep throat infection (antibiotics needed), a cold or influenza or other virus, or from some other cause such as allergies or irritants. usually, a person sees the doctor (call first) if the sore throat is bothersome, recurrent, or doesn't go away quickly. covid-19 infections tend to have cough, whereas strep throat usually lacks cough but has more throat pain. (3/21/20)


In [6]:
matches = matcher.match(patient, best_match=True, ignore_syntax=False)
for x in matches:
    #output all matches
    for final in x:
        print("term:", final["term"])
        semtypes_list = list(final["semtypes"])
        print("semtypes:", semtypes_list) #convert 
        #now, convert semtypes to codes using the dictionary matcher
        semtypes_matches = []
        for type in semtypes_list:
            semtypes_matches.append(semtype_dict[type])
        print("semtype matches:", semtypes_matches)
        print(final)

term: booster
semtypes: ['T061']
semtype matches: ['Therapeutic or Preventive Procedure']
{'start': 47, 'end': 54, 'ngram': 'booster', 'term': 'booster', 'cui': 'C0020975', 'similarity': 1.0, 'semtypes': {'T061'}, 'preferred': 1}
term: contact
semtypes: ['T067']
semtype matches: ['Phenomenon or Process']
{'start': 122, 'end': 129, 'ngram': 'contact', 'term': 'contact', 'cui': 'C0392367', 'similarity': 1.0, 'semtypes': {'T067'}, 'preferred': 1}
term: contact
semtypes: ['T170']
semtype matches: ['Intellectual Product']
{'start': 122, 'end': 129, 'ngram': 'contact', 'term': 'contact', 'cui': 'C3245509', 'similarity': 1.0, 'semtypes': {'T170'}, 'preferred': 1}
term: throat
semtypes: ['T023']
semtype matches: ['Body Part, Organ, or Organ Component']
{'start': 0, 'end': 6, 'ngram': 'throat', 'term': 'throat', 'cui': 'C3665375', 'similarity': 1.0, 'semtypes': {'T023'}, 'preferred': 1}
term: throats
semtypes: ['T023']
semtype matches: ['Body Part, Organ, or Organ Component']
{'start': 0, 'end'

Creating a generalized function to help find matches 


In [7]:
#returns a dictionary of terms to a tuple of semtype_list and matched terms_list with the same length

def quickumls_matcher(text):
    matches = matcher.match(text, best_match=True, ignore_syntax=False)
    terms_dictionary = {}
    for x in matches:
        #output all matches
        for final in x:
            semtypes_list = list(final["semtypes"])
            #now, convert semtypes to codes using the dictionary matcher
            semtypes_matches = []
            for type in semtypes_list:
                semtypes_matches.append(semtype_dict[type])
            terms_dictionary[final["term"]] = (semtypes_list, semtypes_matches)
    return terms_dictionary

In [8]:
text = "The patient was given aspirin for pain relief."
clinician_entity_dict = {}
terms_dictionary = quickumls_matcher(clinician)
print(terms_dictionary)
for term in terms_dictionary: #each term is a key
    data = terms_dictionary[term]
    semtypes_list = data[0]
    semtypes_matches = data[1]
    print("\n____________________________________________________\n")

{'throat infection': (['T047'], ['Disease or Syndrome']), 'strep throat': (['T047'], ['Disease or Syndrome']), 'throat pain': (['T184'], ['Sign or Symptom']), 'Throat pain': (['T184'], ['Sign or Symptom']), 'sore throat': (['T184'], ['Sign or Symptom']), 'No sore throat': (['T033'], ['Finding']), 'infections': (['T046'], ['Pathologic Function']), 'infection': (['T047'], ['Disease or Syndrome']), 'Reinfections': (['T046'], ['Pathologic Function']), 'Coinfections': (['T047'], ['Disease or Syndrome']), 'Infections': (['T046'], ['Pathologic Function']), 'Re-infections': (['T046'], ['Pathologic Function']), 'Co-infections': (['T047'], ['Disease or Syndrome']), 'infections op': (['T047'], ['Disease or Syndrome']), 'gi infections': (['T047'], ['Disease or Syndrome']), 'Reinfection': (['T046'], ['Pathologic Function']), 'Coinfection': (['T047'], ['Disease or Syndrome']), 'influenza': (['T047'], ['Disease or Syndrome']), 'influenza B': (['T047'], ['Disease or Syndrome']), 'influenza C': (['T047

# Finding Entities: Analyzing through the mini data using QuickUMLS


Finding the entity_groups found for both patient and clinician data, and the count of mapped entities for each patient category

In [9]:
patient_entity_dict = {}
clinician_entity_dict = {}
for index,row in df_mini_pairs.iterrows():
    input = row["input"]
    output = row["output"]
    
    try:  
        patient =  input.replace("patient:","").strip()
        terms_dictionary = quickumls_matcher(patient)
        #loop through the entities. Each term is a key
        for term in terms_dictionary:
            data = terms_dictionary[term]
            semtypes_list = data[0]
            semtypes_matches = data[1]
            #then, loop through the matches 
            for match in semtypes_matches:
                if match not in patient_entity_dict:
                    patient_entity_dict[match] = 1
                else:
                    patient_entity_dict[match] += 1
    except Exception as e:
        print(f"An error occurred in patient data: {e}")
        pass

    try: 
        clinician = output.replace("doctor:","").strip()
        terms_dictionary = quickumls_matcher(clinician)
        for term in terms_dictionary:
            data = terms_dictionary[term]
            semtypes_list = data[0]
            semtypes_matches = data[1]
            #then, loop through the matches 
            for match in semtypes_matches:
                if match not in clinician_entity_dict:
                    clinician_entity_dict[match] = 1
                else:
                    clinician_entity_dict[match] += 1
    except Exception as e:
        print(f"An error occurred in clinican data: {e}")
        pass


Analyzing the set of patient entity

In [10]:
print(patient_entity_dict)

{'Therapeutic or Preventive Procedure': 350, 'Intellectual Product': 733, 'Body Part, Organ, or Organ Component': 471, 'Disease or Syndrome': 1100, 'Sign or Symptom': 1104, 'Phenomenon or Process': 59, 'Finding': 1503, 'Organism Function': 154, 'Body Location or Region': 175, 'Laboratory or Test Result': 52, 'Laboratory Procedure': 54, 'Diagnostic Procedure': 109, 'Pharmacologic Substance': 429, 'Hormone': 18, 'Organic Chemical': 324, 'Medical Device': 161, 'Inorganic Chemical': 14, 'Mental or Behavioral Dysfunction': 65, 'Vitamin': 19, 'Mental Process': 247, 'Pathologic Function': 164, 'Health Care Activity': 313, 'Amino Acid, Peptide, or Protein': 24, 'Antibiotic': 247, 'Body Substance': 76, 'Physiologic Function': 52, 'Clinical Attribute': 107, 'Indicator, Reagent, or Diagnostic Aid': 33, 'Immunologic Factor': 24, 'Injury or Poisoning': 50, 'Neoplastic Process': 12, 'Hazardous or Poisonous Substance': 2, 'Clinical Drug': 26, 'Biologically Active Substance': 4, 'Element, Ion, or Isot

In [11]:
print(clinician_entity_dict)

{'Disease or Syndrome': 2029, 'Sign or Symptom': 901, 'Finding': 2160, 'Pathologic Function': 635, 'Clinical Attribute': 241, 'Phenomenon or Process': 90, 'Organism Function': 190, 'Antibiotic': 281, 'Intellectual Product': 1538, 'Body Substance': 87, 'Neoplastic Process': 27, 'Mental Process': 630, 'Mental or Behavioral Dysfunction': 44, 'Amino Acid, Peptide, or Protein': 38, 'Hormone': 35, 'Pharmacologic Substance': 1078, 'Therapeutic or Preventive Procedure': 722, 'Diagnostic Procedure': 161, 'Health Care Activity': 871, 'Organic Chemical': 522, 'Body Part, Organ, or Organ Component': 376, 'Vitamin': 44, 'Laboratory Procedure': 129, 'Inorganic Chemical': 48, 'Medical Device': 260, 'Physiologic Function': 73, 'Immunologic Factor': 55, 'Injury or Poisoning': 95, 'Body Location or Region': 95, 'Hazardous or Poisonous Substance': 13, 'Indicator, Reagent, or Diagnostic Aid': 49, 'Biologically Active Substance': 13, 'Clinical Drug': 16, 'Laboratory or Test Result': 71, 'Element, Ion, or I

Finding the sum of values in total

In [12]:
print("number of patient data:", sum(patient_entity_dict.values()))
print("number of clinican data:", sum(clinician_entity_dict.values()))

number of patient data: 8285
number of clinican data: 13653


Number of unique values in the keys

In [13]:
print( "number of unique patient entites:", len(set(patient_entity_dict.keys())) )

number of unique patient entites: 37


In [14]:
print( "number of unique clinican entites:", len(set(clinician_entity_dict.keys())) )

number of unique clinican entites: 38


Number of clinican entities is similar, with the only difference being:

In [15]:
set(clinician_entity_dict.keys()) - set(patient_entity_dict.keys())

{'Food', 'Manufactured Object'}

Sorting based on values

In [16]:
patient_entity_dict_sorted = sorted(patient_entity_dict.items(), key=lambda  item: item[1], reverse=True)
patient_entity_dict_sorted

[('Finding', 1503),
 ('Sign or Symptom', 1104),
 ('Disease or Syndrome', 1100),
 ('Intellectual Product', 733),
 ('Body Part, Organ, or Organ Component', 471),
 ('Pharmacologic Substance', 429),
 ('Therapeutic or Preventive Procedure', 350),
 ('Organic Chemical', 324),
 ('Health Care Activity', 313),
 ('Mental Process', 247),
 ('Antibiotic', 247),
 ('Body Location or Region', 175),
 ('Pathologic Function', 164),
 ('Medical Device', 161),
 ('Organism Function', 154),
 ('Diagnostic Procedure', 109),
 ('Clinical Attribute', 107),
 ('Body Substance', 76),
 ('Mental or Behavioral Dysfunction', 65),
 ('Phenomenon or Process', 59),
 ('Laboratory Procedure', 54),
 ('Laboratory or Test Result', 52),
 ('Physiologic Function', 52),
 ('Injury or Poisoning', 50),
 ('Indicator, Reagent, or Diagnostic Aid', 33),
 ('Clinical Drug', 26),
 ('Amino Acid, Peptide, or Protein', 24),
 ('Immunologic Factor', 24),
 ('Vitamin', 19),
 ('Hormone', 18),
 ('Inorganic Chemical', 14),
 ('Neoplastic Process', 12),
 (

In [17]:
clinican_entity_dict_sorted = sorted(clinician_entity_dict.items(), key=lambda  item: item[1], reverse=True)
clinican_entity_dict_sorted

[('Finding', 2160),
 ('Disease or Syndrome', 2029),
 ('Intellectual Product', 1538),
 ('Pharmacologic Substance', 1078),
 ('Sign or Symptom', 901),
 ('Health Care Activity', 871),
 ('Therapeutic or Preventive Procedure', 722),
 ('Pathologic Function', 635),
 ('Mental Process', 630),
 ('Organic Chemical', 522),
 ('Body Part, Organ, or Organ Component', 376),
 ('Antibiotic', 281),
 ('Medical Device', 260),
 ('Clinical Attribute', 241),
 ('Organism Function', 190),
 ('Diagnostic Procedure', 161),
 ('Laboratory Procedure', 129),
 ('Injury or Poisoning', 95),
 ('Body Location or Region', 95),
 ('Phenomenon or Process', 90),
 ('Body Substance', 87),
 ('Physiologic Function', 73),
 ('Laboratory or Test Result', 71),
 ('Immunologic Factor', 55),
 ('Indicator, Reagent, or Diagnostic Aid', 49),
 ('Inorganic Chemical', 48),
 ('Mental or Behavioral Dysfunction', 44),
 ('Vitamin', 44),
 ('Amino Acid, Peptide, or Protein', 38),
 ('Hormone', 35),
 ('Neoplastic Process', 27),
 ('Nucleic Acid, Nucleosi

Finding the set differences

In [18]:
set(patient_entity_dict.keys()).difference(set(clinician_entity_dict.keys()))

{'Congenital Abnormality'}

In [19]:
set(clinician_entity_dict.keys()).difference(set(patient_entity_dict.keys()))

{'Food', 'Manufactured Object'}

We can see that there are very few mismatched mapping

Create a csv for further analysis

In [20]:
analysis_df = pd.DataFrame([patient_entity_dict, clinician_entity_dict])
analysis_df.insert(0, "names", ["patient", "clinician"]) ## add column name
analysis_df.head()

Unnamed: 0,names,Therapeutic or Preventive Procedure,Intellectual Product,"Body Part, Organ, or Organ Component",Disease or Syndrome,Sign or Symptom,Phenomenon or Process,Finding,Organism Function,Body Location or Region,...,Injury or Poisoning,Neoplastic Process,Hazardous or Poisonous Substance,Clinical Drug,Biologically Active Substance,"Element, Ion, or Isotope","Nucleic Acid, Nucleoside, or Nucleotide",Congenital Abnormality,Food,Manufactured Object
0,patient,350,733,471,1100,1104,59,1503,154,175,...,50,12,2,26,4,2,6,2.0,,
1,clinician,722,1538,376,2029,901,90,2160,190,95,...,95,27,13,16,13,5,26,,1.0,4.0


In [21]:
#SAVING
analysis_df.to_csv("MedDialog_UMLS_entity_count.csv")

# Finding entity mismatches: looping through the minidata to find mismatches between entities the paired data

Now, we can perform a different type of analysis. Now, that we know that there is a discrepancy between total values, what are differences in the occurance per dialogue. We use the list of found entities before 

In [22]:
clinician_entity_dict.keys()

dict_keys(['Disease or Syndrome', 'Sign or Symptom', 'Finding', 'Pathologic Function', 'Clinical Attribute', 'Phenomenon or Process', 'Organism Function', 'Antibiotic', 'Intellectual Product', 'Body Substance', 'Neoplastic Process', 'Mental Process', 'Mental or Behavioral Dysfunction', 'Amino Acid, Peptide, or Protein', 'Hormone', 'Pharmacologic Substance', 'Therapeutic or Preventive Procedure', 'Diagnostic Procedure', 'Health Care Activity', 'Organic Chemical', 'Body Part, Organ, or Organ Component', 'Vitamin', 'Laboratory Procedure', 'Inorganic Chemical', 'Medical Device', 'Physiologic Function', 'Immunologic Factor', 'Injury or Poisoning', 'Body Location or Region', 'Hazardous or Poisonous Substance', 'Indicator, Reagent, or Diagnostic Aid', 'Biologically Active Substance', 'Clinical Drug', 'Laboratory or Test Result', 'Element, Ion, or Isotope', 'Nucleic Acid, Nucleoside, or Nucleotide', 'Food', 'Manufactured Object'])

In [23]:
#initialize a dictionary of mismatches 
patient_mismatch_dict = {} #entities mapped in patient text but not in the clinican text
clinician_mismatch_dict = {}  #entities mapped in clinican text but not in the patient text
for key in list(patient_entity_dict.keys()):
    patient_mismatch_dict[key] = 0
for key in list(clinician_entity_dict.keys()):
    clinician_mismatch_dict[key] = 0


In [24]:
#track the row indices with a patient mismatch mapping diex w
patient_mismatch_rows = []
#track the rows with a  clinican mismatch
clinician_mismatch_rows = []

for index,row in df_mini_pairs.iterrows():
    current_patient_entities = set()
    current_clinican_entities = set()
    input = row["input"]
    output = row["output"]
    
    try:  
        patient =  input.replace("patient:","").strip()
        terms_dictionary = quickumls_matcher(patient)
        #loop through the entities. Each term is a key
        for term in terms_dictionary:
            data = terms_dictionary[term]
            semtypes_list = data[0]
            semtypes_matches = data[1]
            #add matches to the set
            current_patient_entities.update(semtypes_matches)

    except Exception as e:
        print(f"An error occurred in patient data: {e}")
        pass

    try: 
        clinician = output.replace("doctor:","").strip()
        terms_dictionary = quickumls_matcher(clinician)
        for term in terms_dictionary:
            data = terms_dictionary[term]
            semtypes_list = data[0]
            semtypes_matches = data[1]
            #add matches to the set 
            current_clinican_entities.update(semtypes_matches)

    except:
        print("no clinican mapping at index:", index)
        pass 
    #find mismatches by computing set differneces 
    #entities mapped in patient text but not in the clinican text. increment the dictionary of mismatches 
    patient_mismatch = current_patient_entities - current_clinican_entities
    if len(patient_mismatch) > 0:
        patient_mismatch_rows.append(index) #if the number of mismatches is non zero, append
    for key in patient_mismatch:
        patient_mismatch_dict[key] += 1

    clinician_mismatch = current_clinican_entities - current_patient_entities
    if len(clinician_mismatch) > 0:
        clinician_mismatch_rows.append(index) #if the number of mismatches is non zero, append
    for key in clinician_mismatch:
        clinician_mismatch_dict[key] += 1



Printing sums of total mismatches

In [25]:
print("number of patient data:", sum(patient_mismatch_dict.values()))
print("number of clinician data:", sum(clinician_mismatch_dict.values()))

number of patient data: 1551
number of clinician data: 3109


number of rows of mismatches and percentages

In [26]:
print("number of patient data mismatches:", len(patient_mismatch_rows))
print("number of clinician data mismatches:", len(clinician_mismatch_rows))

number of patient data mismatches: 490
number of clinician data mismatches: 578


In [27]:
print("percent of rows with patient data mismatches:", len(patient_mismatch_rows)/len(df_mini_pairs)*100)
print("percent of rows with clinician data mismatches:", len(clinician_mismatch_rows)/len(df_mini_pairs)*100)

percent of rows with patient data mismatches: 81.66666666666667
percent of rows with clinician data mismatches: 96.33333333333334


printing out mismatch dictionary

In [28]:
#sort
patient_mismatch_dict_sorted = sorted(patient_mismatch_dict.items(), key=lambda  item: item[1], reverse=True)
patient_mismatch_dict_sorted

[('Sign or Symptom', 144),
 ('Body Part, Organ, or Organ Component', 139),
 ('Disease or Syndrome', 113),
 ('Body Location or Region', 100),
 ('Therapeutic or Preventive Procedure', 75),
 ('Organic Chemical', 72),
 ('Mental Process', 65),
 ('Organism Function', 64),
 ('Medical Device', 63),
 ('Finding', 51),
 ('Health Care Activity', 51),
 ('Clinical Attribute', 50),
 ('Diagnostic Procedure', 49),
 ('Pharmacologic Substance', 49),
 ('Intellectual Product', 46),
 ('Pathologic Function', 46),
 ('Phenomenon or Process', 42),
 ('Mental or Behavioral Dysfunction', 39),
 ('Body Substance', 39),
 ('Physiologic Function', 38),
 ('Laboratory or Test Result', 32),
 ('Antibiotic', 32),
 ('Indicator, Reagent, or Diagnostic Aid', 28),
 ('Injury or Poisoning', 24),
 ('Laboratory Procedure', 23),
 ('Clinical Drug', 13),
 ('Inorganic Chemical', 12),
 ('Amino Acid, Peptide, or Protein', 12),
 ('Hormone', 10),
 ('Immunologic Factor', 10),
 ('Vitamin', 4),
 ('Neoplastic Process', 4),
 ('Nucleic Acid, Nuc

In [29]:
clinician_mismatch_dict_sorted = sorted(clinician_mismatch_dict.items(), key=lambda  item: item[1], reverse=True)
clinician_mismatch_dict_sorted

[('Mental Process', 284),
 ('Pharmacologic Substance', 269),
 ('Therapeutic or Preventive Procedure', 219),
 ('Health Care Activity', 213),
 ('Intellectual Product', 198),
 ('Clinical Attribute', 160),
 ('Disease or Syndrome', 144),
 ('Pathologic Function', 141),
 ('Medical Device', 141),
 ('Organic Chemical', 135),
 ('Organism Function', 112),
 ('Body Part, Organ, or Organ Component', 108),
 ('Finding', 105),
 ('Laboratory Procedure', 78),
 ('Diagnostic Procedure', 77),
 ('Phenomenon or Process', 71),
 ('Sign or Symptom', 63),
 ('Body Substance', 56),
 ('Physiologic Function', 54),
 ('Laboratory or Test Result', 54),
 ('Injury or Poisoning', 50),
 ('Antibiotic', 47),
 ('Indicator, Reagent, or Diagnostic Aid', 46),
 ('Inorganic Chemical', 44),
 ('Body Location or Region', 43),
 ('Mental or Behavioral Dysfunction', 32),
 ('Immunologic Factor', 32),
 ('Amino Acid, Peptide, or Protein', 24),
 ('Nucleic Acid, Nucleoside, or Nucleotide', 24),
 ('Neoplastic Process', 14),
 ('Vitamin', 14),
 

Save mismatch data to a csv

In [30]:
analysis_df = pd.DataFrame([patient_mismatch_dict, clinician_mismatch_dict])
analysis_df.insert(0, "names", ["patient", "clinician"]) ## add column name
analysis_df.head()

Unnamed: 0,names,Therapeutic or Preventive Procedure,Intellectual Product,"Body Part, Organ, or Organ Component",Disease or Syndrome,Sign or Symptom,Phenomenon or Process,Finding,Organism Function,Body Location or Region,...,Injury or Poisoning,Neoplastic Process,Hazardous or Poisonous Substance,Clinical Drug,Biologically Active Substance,"Element, Ion, or Isotope","Nucleic Acid, Nucleoside, or Nucleotide",Congenital Abnormality,Food,Manufactured Object
0,patient,75,46,139,113,144,42,51,64,100,...,24,4,2,13,3,2,4,1.0,,
1,clinician,219,198,108,144,63,71,105,112,43,...,50,14,11,12,12,5,24,,1.0,3.0


In [31]:
analysis_df.to_csv("MedDialog_UMLS_entity_mismatch_count.csv")

## Looping to extract a set of terms, not entities

Finding a set of medical terms, not entities. This could be large


In [32]:
#helper function to get list of terms corresponding to a entity name
def getEntityWords(input : dict, name: str):
    if input == None:
        return []
    result = []
    for term in input:
        data = input[term]
        semtypes_list = data[0]
        semtypes_matches = data[1]
        for entity in semtypes_matches:
            if entity == name:
                result.append(term)

    return result

In [33]:
test = {'throat infection': (['T047'], ['Disease or Syndrome']), 'strep throat': (['T047'], ['Disease or Syndrome']), 'throat pain': (['T184'], ['Sign or Symptom']), 'Throat pain': (['T184'], ['Sign or Symptom']), 'sore throat': (['T184'], ['Sign or Symptom']), 'No sore throat': (['T033'], ['Finding']), 'infections': (['T046'], ['Pathologic Function']), 'infection': (['T047'], ['Disease or Syndrome']), 'Reinfections': (['T046'], ['Pathologic Function']), 'Coinfections': (['T047'], ['Disease or Syndrome']), 'Infections': (['T046'], ['Pathologic Function']), 'Re-infections': (['T046'], ['Pathologic Function']), 'Co-infections': (['T047'], ['Disease or Syndrome']), 'infections op': (['T047'], ['Disease or Syndrome']), 'gi infections': (['T047'], ['Disease or Syndrome']), 'Reinfection': (['T046'], ['Pathologic Function']), 'Coinfection': (['T047'], ['Disease or Syndrome']), 'influenza': (['T047'], ['Disease or Syndrome']), 'influenza B': (['T047'], ['Disease or Syndrome']), 'influenza C': (['T047'], ['Disease or Syndrome']), 'influenza A': (['T047'], ['Disease or Syndrome']), 'allergies': (['T046'], ['Pathologic Function']), 'Allergies': (['T201'], ['Clinical Attribute']), 'pandemic': (['T067'], ['Phenomenon or Process']), 'virus': (['T047'], ['Disease or Syndrome']), 'cold': (['T047'], ['Disease or Syndrome']), 'sees': (['T040'], ['Organism Function']), 'Antibiotics': (['T195'], ['Antibiotic']), 'antibiotic Y': (['T195'], ['Antibiotic']), 'antibiotics': (['T195'], ['Antibiotic']), 'antibiotic': (['T195'], ['Antibiotic']), 'coughs': (['T033'], ['Finding']), 'cough': (['T184'], ['Sign or Symptom'])}
getEntityWords(test, "Disease or Syndrome")

['throat infection',
 'strep throat',
 'infection',
 'Coinfections',
 'Co-infections',
 'infections op',
 'gi infections',
 'Coinfection',
 'influenza',
 'influenza B',
 'influenza C',
 'influenza A',
 'virus',
 'cold']

In [37]:
#track the row indices with a patient mismatch mapping diex w
patient_mismatch_rows = []
#track the rows with a  clinican mismatch
clinician_mismatch_rows = []

patient_entity_terms = []
clinician_entity_terms = []

patient_mismatch_type = [] #give the name of the mismatched entity that corresponds to the mismatch
clinician_mismatch_type = []

for index,row in df_mini_pairs.iterrows():
    current_patient_entities = set()
    current_clinican_entities = set()
    patient_results = None
    clinician_results = None
    input = row["input"]
    output = row["output"]
    
    try:  
        patient =  input.replace("patient:","").strip()
        terms_dictionary = quickumls_matcher(patient)
        patient_results = terms_dictionary
        #loop through the entities. Each term is a key
        for term in terms_dictionary:
            data = terms_dictionary[term]
            semtypes_list = data[0]
            semtypes_matches = data[1]
            #add matches to the set
            current_patient_entities.update(semtypes_matches)

    except Exception as e:
        print(f"An error occurred in patient data: {e}")
        pass

    try: 
        clinician = output.replace("doctor:","").strip()
        terms_dictionary = quickumls_matcher(clinician)
        clinician_results = terms_dictionary
        for term in terms_dictionary:
            data = terms_dictionary[term]
            semtypes_list = data[0]
            semtypes_matches = data[1]
            #add matches to the set 
            current_clinican_entities.update(semtypes_matches)

    except:
        print("no clinican mapping at index:", index)
        pass 
    #find mismatches by computing set differneces 
    #entities mapped in patient text but not in the clinican text. increment the dictionary of mismatches 
    patient_mismatch = current_patient_entities - current_clinican_entities
    if len(patient_mismatch) > 0:
        patient_mismatch_rows.append(index) #if the number of mismatches is non zero, append
    for key in patient_mismatch:
        patient_mismatch_dict[key] += 1
        try: #append mismatched terms for patient data
            result = getEntityWords(patient_results,key)
            patient_entity_terms += result 
            patient_mismatch_type += [key]*len(result) #mismatch type is the key mismatched
        except: 
            pass

    clinician_mismatch = current_clinican_entities - current_patient_entities
    if len(clinician_mismatch) > 0:
        clinician_mismatch_rows.append(index) #if the number of mismatches is non zero, append
    for key in clinician_mismatch:
        clinician_mismatch_dict[key] += 1
        try: #append mismatched terms for patient data
            result = getEntityWords(clinician_results,key)
            clinician_entity_terms += result 
            clinician_mismatch_type += [key]*len(result) #mismatch type is the key mismatched
        except: 
            pass



Printing out side of patient terms and clincians terms and using set similarity

In [38]:
print(len(patient_entity_terms))
print(len(patient_mismatch_type))
patient_entity_terms

2913
2913


['booster',
 'throat',
 'throats',
 'contact',
 'see',
 'coronavirus',
 'chest',
 'progesterones',
 'Hyprogesterone',
 '17α-progesterone',
 'progesterone',
 'progesterone level test',
 'Progesterone',
 'diagnosed',
 'cervix',
 'progesterone level',
 'Serum progesterone level',
 'weak',
 'progesterone iud',
 'Stalkings',
 'Stalking',
 'air',
 'teste',
 'appetite',
 'tested',
 'measles vaccination',
 'pandemic',
 'coronavirus',
 'virus',
 'Inpatient',
 'Reinfected',
 'back',
 'levaquin',
 'berythromycin',
 'erythromycin G',
 'erythromycin es',
 'erythromycin',
 'prednisone',
 'indamycin',
 'clindamycin',
 'prednisone',
 'feel',
 'positive blood culture',
 'phlegm',
 'Bextra',
 'teste',
 'Bextra',
 'teste',
 'fro',
 'sounding',
 'alarm',
 'nap',
 'alarms',
 'chest',
 'week',
 'mended',
 'mended',
 'chest x ray',
 'lungs',
 'coughing',
 'cough',
 'phlegm',
 'Right chest',
 'breathing',
 'Breathing',
 'Wry mouth',
 'dry mouth',
 'breathing difficulty',
 'Breathing difficulties',
 'tight che

In [39]:
print(len(clinician_entity_terms))
print(len(clinician_mismatch_type))
clinician_entity_terms

6510
6510


['Allergies',
 'infections',
 'Reinfections',
 'Infections',
 'Re-infections',
 'Reinfection',
 'allergies',
 'sees',
 'Antibiotics',
 'antibiotic Y',
 'antibiotics',
 'antibiotic',
 'hypertensin',
 'Shortness of breath',
 'cancer',
 'hypertensin',
 'hypertensin',
 'attentions',
 'attention',
 'sputum',
 'Inattention',
 'plication',
 'quarantine',
 'NB complication',
 'Complications',
 'remember',
 'complication',
 'IUD complication',
 'complication iud',
 'CNS complication',
 'guidelines',
 'guideline',
 'Guidelines',
 'provider',
 'self',
 'low',
 'Tissue',
 'isolations',
 'chat',
 'Diagnosis',
 'diagnosis',
 'like',
 'persistent fever',
 'Reinfected',
 'dry cough',
 'symptoms',
 'fever',
 'fevers',
 'tiredness',
 'contact',
 'self',
 'text',
 'understand',
 'happy',
 'hope',
 'healthcare',
 'treatment',
 'nontreatment',
 'co-treatment',
 'consult',
 'Antibiotics',
 'antibiotic Y',
 'antibiotics',
 'antibiotic',
 'Disinfection',
 'infection',
 'infections',
 'Reinfections',
 'Reinfec

computing jaccard similarity

In [40]:
pghd_set = set(patient_entity_terms)
clinician_set = set(clinician_entity_terms)

jaccard = len(pghd_set & clinician_set) / len(pghd_set | clinician_set)
print(f"Conceptual Overlap (Jaccard): {jaccard:.3f}")

Conceptual Overlap (Jaccard): 0.203


In [41]:
print(pghd_set-clinician_set)

{'Unchanged', 'progesterone level', 'couplet', 'mediastinal', 'Tingling', 'RF 90 minutes', 'A/N risk factors', 'augmentin duo', 'tonsils', 'metampicillin', 'itchy ears', 'uncomplicated hypertension', 'No dizziness', 'viral pneumonia', 'risk factors', 'nauseous', 'paranoia', 'delivery', 'removals', 'running nose', 'penicillin', 'Upper respiratory disease', 'Repression', 'growths', 'Sinus congestion', 'spit', 'dexamethasone oral', 'neck strained', 'Manual hospital bed', 'sounding', 'No risk factors', 'roxithromycin', 'testosterones', 'alarmed', 'Injections', 'stylet', 'indamycin', 'sore', 'spitting', 'T2 deficiency', 'fatigue', 'apathy', 'penicillin B', 'k deficiency', 'stomach', 'armpit', 'Other bacterial pneumonia', 'removal', 'complaint', 'light-headedness', 'medrol', 'brain', 'hoping', 'itching', 'ribs', 'fractured', 'shoulders', 'ceclor', 'energy', 'Inspiration', 'dexamethasone Pill', 'head', 'slipped', '17α-progesterone', 'burning sensations skin', 'nebulizers', 'left side of head'

Save results

In [42]:
mismatches_1 = pd.DataFrame({
    'word': patient_entity_terms,
    'entity': patient_mismatch_type
})
mismatches_1.to_csv("MedDialog_UMLS_patient_mismatch_terms.csv")

mismatches_2 = pd.DataFrame({
    'word': clinician_entity_terms,
    'entity': clinician_mismatch_type
})
mismatches_2.to_csv("MedDialog_UMLS_clinician_mismatch_terms.csv")

## future:

Showing (through concepts per 100 words): 
QuickUMLS ontology has beeter coveration