## Part 1: Reading JSON file into a dataframe and saving/examining it.

In [1]:
import pandas as pd
import json

Large data length and size

In [2]:
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


Extract the BigBio dataset

In [3]:

# Load the English Combined dataset (BigBio version)
datasets = load_dataset("UCSD26/medical_dialog", "processed.en")

# Available splits: train, validation, test
print(datasets)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['description', 'utterances'],
        num_rows: 482
    })
    validation: Dataset({
        features: ['description', 'utterances'],
        num_rows: 60
    })
    test: Dataset({
        features: ['description', 'utterances'],
        num_rows: 61
    })
})


In [4]:
dfs = []
for key in datasets:
    df1 = pd.DataFrame(datasets[key])
    dfs.append(df1)
df_mini = pd.concat(dfs)

In [5]:
df_mini.head()

Unnamed: 0,description,utterances
0,throat a bit sore and want to get a good imune...,[patient: throat a bit sore and want to get a ...
1,"hey there i have had cold ""symptoms"" for over ...","[patient: hey there i have had cold ""symptoms""..."
2,i have a tight and painful chest with a dry co...,[patient: i have a tight and painful chest wit...
3,what will happen after the incubation period f...,[patient: what will happen after the incubatio...
4,suggest treatment for pneumonia,[patient: just found out i was pregnant. yeste...


## Further Processing the data: Using the BioBert model on a single data sample

### analyzing the minidata to see how many conversations are there with two utterances (two parts)

First, we need to convert the utterances row to a list

In [6]:
utterance_count_dict = {}
for index, row in df_mini.iterrows():
    utterances = row["utterances"]
    utterance_count = len(utterances)#get the current utterance count 
    if utterance_count not in utterance_count_dict: 
        utterance_count_dict[utterance_count] = 1
    else:
        utterance_count_dict[utterance_count] = utterance_count_dict[utterance_count] + 1

printing the length of utterances:

In [7]:
print(utterance_count_dict)

{2: 600, 17: 1, 4: 1, 8: 1}


As we can see the vast majority of the dataset contains 2 way utterances

### isolating the utterance pairs (they always start with patient)

In [8]:
df_mini_pairs = pd.DataFrame(df_mini[df_mini["utterances"].apply(len) == 2])
print("number of minipairs data:", len(df_mini_pairs))

number of minipairs data: 600


In [9]:
df_mini_pairs.head(n=3)


Unnamed: 0,description,utterances
0,throat a bit sore and want to get a good imune...,[patient: throat a bit sore and want to get a ...
1,"hey there i have had cold ""symptoms"" for over ...","[patient: hey there i have had cold ""symptoms""..."
2,i have a tight and painful chest with a dry co...,[patient: i have a tight and painful chest wit...


Example reading a line by index

In [10]:
#
index = 0
print("patient text:", df_mini_pairs.iloc[index]["utterances"][0])
print("clinician text:",df_mini_pairs.iloc[index]["utterances"][1])

patient text: patient: throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus.
clinician text: doctor: during this pandemic. throat pain can be from a strep throat infection (antibiotics needed), a cold or influenza or other virus, or from some other cause such as allergies or irritants. usually, a person sees the doctor (call first) if the sore throat is bothersome, recurrent, or doesn't go away quickly. covid-19 infections tend to have cough, whereas strep throat usually lacks cough but has more throat pain. (3/21/20)


changing to a standard format. Patient text is input and doctor is output

In [11]:
inputs = []
outputs = []
for index, row in df_mini_pairs.iterrows():
    utterances = row["utterances"]
    inputs.append(utterances[0])
    outputs.append(utterances[1])
#add columns
df_mini_pairs["input"] = inputs
df_mini_pairs["output"] = outputs

In [12]:
df_mini_pairs.head(n=5)

Unnamed: 0,description,utterances,input,output
0,throat a bit sore and want to get a good imune...,[patient: throat a bit sore and want to get a ...,patient: throat a bit sore and want to get a g...,doctor: during this pandemic. throat pain can ...
1,"hey there i have had cold ""symptoms"" for over ...","[patient: hey there i have had cold ""symptoms""...","patient: hey there i have had cold ""symptoms"" ...",doctor: yes. protection. it is not enough symp...
2,i have a tight and painful chest with a dry co...,[patient: i have a tight and painful chest wit...,patient: i have a tight and painful chest with...,"doctor: possible. top symptoms include fever, ..."
3,what will happen after the incubation period f...,[patient: what will happen after the incubatio...,patient: what will happen after the incubation...,doctor: in brief: symptoms if you are infected...
4,suggest treatment for pneumonia,[patient: just found out i was pregnant. yeste...,patient: just found out i was pregnant. yester...,doctor: thanks for your question on healthcare...


Merging the larger dataset

In [13]:
large_100 = pd.read_json("../data/HealthCareMagic-100k.json")
print(len(large_100))

large_100.head(n=1)

112165


Unnamed: 0,instruction,input,output
0,"If you are a doctor, please answer the medical...",I woke up this morning feeling the whole room ...,"Hi, Thank you for posting your query. The most..."


Merge dataset and join

In [14]:
df_mini_pairs = pd.concat([df_mini_pairs, large_100])
print(len(df_mini_pairs))
df_mini_pairs = df_mini_pairs[["input", "output"]]
df_mini_pairs.head(n=1)


112765


Unnamed: 0,input,output
0,patient: throat a bit sore and want to get a g...,doctor: during this pandemic. throat pain can ...


Saving the paired data

In [15]:
df_mini_pairs.to_csv("english-train-paired-conversations.csv")

### now, testing BioBert on this data 

We load a fine-tuned version of BioBert

In [16]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")

pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") # pass device=0 if using gpu
pipe("""The patient reported no recurrence of palpitations at follow-up 6 months after the ablation.""")

Device set to use cpu


[{'entity_group': 'Sign_symptom',
  'score': np.float32(0.9999311),
  'word': 'pal',
  'start': 38,
  'end': 41},
 {'entity_group': 'Sign_symptom',
  'score': np.float32(0.90633166),
  'word': '##pitations',
  'start': 41,
  'end': 50},
 {'entity_group': 'Clinical_event',
  'score': np.float32(0.99975544),
  'word': 'follow',
  'start': 54,
  'end': 60},
 {'entity_group': 'Date',
  'score': np.float32(0.999867),
  'word': '6 months after',
  'start': 64,
  'end': 78}]

Testing data on a single datapoint.

Split based on last : found

In [17]:
#
index = 0
patient = df_mini_pairs.iloc[index]["input"].replace("patient:","").strip() #raw patient string replacing doctor, patient
clinician = df_mini_pairs.iloc[index]["output"].replace("doctor:","").strip() #raw clinican stinrg
print(patient,"\n",clinician)

throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus. 
 during this pandemic. throat pain can be from a strep throat infection (antibiotics needed), a cold or influenza or other virus, or from some other cause such as allergies or irritants. usually, a person sees the doctor (call first) if the sore throat is bothersome, recurrent, or doesn't go away quickly. covid-19 infections tend to have cough, whereas strep throat usually lacks cough but has more throat pain. (3/21/20)


In [18]:
pipe(patient)

[{'entity_group': 'Biological_structure',
  'score': np.float32(0.9998447),
  'word': 'throat',
  'start': 0,
  'end': 6},
 {'entity_group': 'Sign_symptom',
  'score': np.float32(0.9993999),
  'word': 'sore',
  'start': 13,
  'end': 17}]

In [19]:
result = pipe(clinician)
print(result)

[{'entity_group': 'Biological_structure', 'score': np.float32(0.99981815), 'word': 'throat', 'start': 22, 'end': 28}, {'entity_group': 'Sign_symptom', 'score': np.float32(0.9998851), 'word': 'pain', 'start': 29, 'end': 33}, {'entity_group': 'Biological_structure', 'score': np.float32(0.999597), 'word': 'throat', 'start': 54, 'end': 60}, {'entity_group': 'Sign_symptom', 'score': np.float32(0.830349), 'word': 'sore', 'start': 240, 'end': 244}, {'entity_group': 'Biological_structure', 'score': np.float32(0.95489234), 'word': 'throat', 'start': 245, 'end': 251}, {'entity_group': 'Disease_disorder', 'score': np.float32(0.4631344), 'word': 'co', 'start': 306, 'end': 308}, {'entity_group': 'Disease_disorder', 'score': np.float32(0.57466906), 'word': '##vid - 19', 'start': 308, 'end': 314}, {'entity_group': 'Disease_disorder', 'score': np.float32(0.9928797), 'word': 'infections', 'start': 315, 'end': 325}, {'entity_group': 'Biological_structure', 'score': np.float32(0.8288955), 'word': 'st', '

In [20]:
result[0]["entity_group"]

'Biological_structure'

# Finding entities: Analyzing through the mini data using BioBert


Finding the entity_groups found for both patient and clinician data, and the count of mapped entities for each patient category

In [None]:
patient_entity_dict = {}
clinician_entity_dict = {}
for index,row in df_mini_pairs.iterrows():
    input = row["input"]
    output = row["output"]
    try:  
        patient =  input.replace("patient:","").strip()
        patient_results = pipe(patient)
        #loop through the ntities
        for dict in patient_results:
            curr_entity = dict["entity_group"]
            if curr_entity not in patient_entity_dict:
                patient_entity_dict[curr_entity] = 1
            else: 
                patient_entity_dict[curr_entity] = patient_entity_dict[curr_entity] + 1
    except:
        print("no patient mapping at index:", index)
        pass 

    try: 
        clinician = output.replace("doctor:","").strip()
        clinician_results = pipe(clinician)
        for dict in clinician_results:
            curr_entity = dict["entity_group"]
            if curr_entity not in clinician_entity_dict:
                clinician_entity_dict[curr_entity] = 1
            else: 
                clinician_entity_dict[curr_entity] = clinician_entity_dict[curr_entity] + 1
    except:
        print("no clinican mapping at index:", index)
        pass 


no patient mapping at index: 35089


Analyzing the set of patient entity

In [None]:
print(patient_entity_dict)

{'Biological_structure': 453, 'Sign_symptom': 1196, 'Detailed_description': 403, 'Duration': 210, 'Severity': 105, 'Nonbiological_location': 238, 'Disease_disorder': 471, 'Coreference': 59, 'Clinical_event': 141, 'Subject': 172, 'Date': 182, 'Activity': 120, 'Medication': 575, 'Lab_value': 237, 'Outcome': 23, 'Frequency': 71, 'Age': 81, 'Dosage': 76, 'Diagnostic_procedure': 401, 'Time': 76, 'Texture': 5, 'History': 121, 'Therapeutic_procedure': 113, 'Sex': 10, 'Quantitative_concept': 15, 'Administration': 29, 'Other_event': 12, 'Occupation': 17, 'Distance': 5, 'Other_entity': 9, 'Color': 4, 'Family_history': 14, 'Personal_background': 5, 'Area': 1}


In [None]:
print(clinician_entity_dict)

{'Biological_structure': 351, 'Sign_symptom': 794, 'Disease_disorder': 588, 'Medication': 548, 'Detailed_description': 541, 'Date': 34, 'Clinical_event': 258, 'Lab_value': 219, 'Duration': 93, 'Diagnostic_procedure': 527, 'Severity': 67, 'Activity': 331, 'Administration': 57, 'Therapeutic_procedure': 238, 'Frequency': 30, 'Coreference': 50, 'Family_history': 12, 'Nonbiological_location': 185, 'Subject': 55, 'Time': 18, 'History': 48, 'Other_event': 16, 'Age': 14, 'Dosage': 26, 'Outcome': 8, 'Sex': 4, 'Other_entity': 16, 'Occupation': 3, 'Distance': 6, 'Quantitative_concept': 2, 'Texture': 2, 'Personal_background': 3, 'Color': 1}


Finding the sum of values in total

In [None]:
print("number of patient data:", sum(patient_entity_dict.values()))
print("number of clinicain data:", sum(clinician_entity_dict.values()))

number of patient data: 5650
number of clinicain data: 5145


Since 480 total datapoints, on  patient data is mapped : (per dialogue)

In [None]:
4440/480

9.25

for clinican data:

In [None]:
4082/480

8.504166666666666

Sorting based on values

In [None]:
patient_entity_dict_sorted = sorted(patient_entity_dict.items(), key=lambda  item: item[1], reverse=True)
patient_entity_dict_sorted

[('Sign_symptom', 1196),
 ('Medication', 575),
 ('Disease_disorder', 471),
 ('Biological_structure', 453),
 ('Detailed_description', 403),
 ('Diagnostic_procedure', 401),
 ('Nonbiological_location', 238),
 ('Lab_value', 237),
 ('Duration', 210),
 ('Date', 182),
 ('Subject', 172),
 ('Clinical_event', 141),
 ('History', 121),
 ('Activity', 120),
 ('Therapeutic_procedure', 113),
 ('Severity', 105),
 ('Age', 81),
 ('Dosage', 76),
 ('Time', 76),
 ('Frequency', 71),
 ('Coreference', 59),
 ('Administration', 29),
 ('Outcome', 23),
 ('Occupation', 17),
 ('Quantitative_concept', 15),
 ('Family_history', 14),
 ('Other_event', 12),
 ('Sex', 10),
 ('Other_entity', 9),
 ('Texture', 5),
 ('Distance', 5),
 ('Personal_background', 5),
 ('Color', 4),
 ('Area', 1)]

In [None]:
clinican_entity_dict_sorted = sorted(clinician_entity_dict.items(), key=lambda  item: item[1], reverse=True)
clinican_entity_dict_sorted

[('Sign_symptom', 794),
 ('Disease_disorder', 588),
 ('Medication', 548),
 ('Detailed_description', 541),
 ('Diagnostic_procedure', 527),
 ('Biological_structure', 351),
 ('Activity', 331),
 ('Clinical_event', 258),
 ('Therapeutic_procedure', 238),
 ('Lab_value', 219),
 ('Nonbiological_location', 185),
 ('Duration', 93),
 ('Severity', 67),
 ('Administration', 57),
 ('Subject', 55),
 ('Coreference', 50),
 ('History', 48),
 ('Date', 34),
 ('Frequency', 30),
 ('Dosage', 26),
 ('Time', 18),
 ('Other_event', 16),
 ('Other_entity', 16),
 ('Age', 14),
 ('Family_history', 12),
 ('Outcome', 8),
 ('Distance', 6),
 ('Sex', 4),
 ('Occupation', 3),
 ('Personal_background', 3),
 ('Quantitative_concept', 2),
 ('Texture', 2),
 ('Color', 1)]

Finding the set differences

In [None]:
set(patient_entity_dict.keys()).difference(set(clinician_entity_dict.keys()))

{'Area'}

In [None]:
set(clinician_entity_dict.keys()).difference(set(patient_entity_dict.keys()))

set()

We can see there is only one mapping with no overlaps

create csv file for easy analysis later. We can easily create a dataframe from the two dictionaries

In [None]:
analysis_df = pd.DataFrame([patient_entity_dict, clinician_entity_dict])
analysis_df.insert(0, "names", ["patient", "clinician"]) ## add column name
analysis_df.head()

Unnamed: 0,names,Biological_structure,Sign_symptom,Detailed_description,Duration,Severity,Nonbiological_location,Disease_disorder,Coreference,Clinical_event,...,Quantitative_concept,Administration,Other_event,Occupation,Distance,Other_entity,Color,Family_history,Personal_background,Area
0,patient,453,1196,403,210,105,238,471,59,141,...,15,29,12,17,5,9,4,14,5,1.0
1,clinician,351,794,541,93,67,185,588,50,258,...,2,57,16,3,6,16,1,12,3,


Save this file

In [None]:
analysis_df.to_csv("Combined_BERT_entity_count.csv")

# Finding Mismatches : looping through the minidata to find mismatches between entities the paired data

Now, we can perform a different type of analysis. Now, that we know that there is a discrepancy between total values, what are differences in the occurance per dialogue. We use the list of found entities before 

In [None]:
clinician_entity_dict.keys()

dict_keys(['Biological_structure', 'Sign_symptom', 'Disease_disorder', 'Medication', 'Detailed_description', 'Date', 'Clinical_event', 'Lab_value', 'Duration', 'Diagnostic_procedure', 'Severity', 'Activity', 'Administration', 'Therapeutic_procedure', 'Frequency', 'Coreference', 'Family_history', 'Nonbiological_location', 'Subject', 'Time', 'History', 'Other_event', 'Age', 'Dosage', 'Outcome', 'Sex', 'Other_entity', 'Occupation', 'Distance', 'Quantitative_concept', 'Texture', 'Personal_background', 'Color'])

In [None]:
#initialize a dictionary of mismatches 
patient_mismatch_dict = {} #entities mapped in patient text but not in the clinican text
clinician_mismatch_dict = {}  #entities mapped in clinican text but not in the patient text
for key in list(patient_entity_dict.keys()):
    patient_mismatch_dict[key] = 0
for key in list(clinician_entity_dict.keys()):
    clinician_mismatch_dict[key] = 0


In [None]:
#track the row indices with a patient mismatch mapping diex w
patient_mismatch_rows = []
#track the rows with a  clinican mismatch
clinician_mismatch_rows = []

for index,row in df_mini_pairs.iterrows():
    current_patient_entities = set()
    current_clinican_entities = set()
    input = row["input"]
    output = row["output"]
    
    try:  
        patient =  input.replace("patient:","").strip()
        patient_results = pipe(patient)
        #loop through the ntities
        for dict in patient_results:
            curr_entity = dict["entity_group"]
            current_patient_entities.add(curr_entity)
    except:
        print("no patient mapping at index:", index)
        pass 

    try: 
        clinician = output.replace("doctor:","").strip()
        clinician_results = pipe(clinician)
        for dict in clinician_results:
            curr_entity = dict["entity_group"]
            current_clinican_entities.add(curr_entity)
    except:
        print("no clinican mapping at index:", index)
        pass 
    #find mismatches by computing set differneces 
    #entities mapped in patient text but not in the clinican text. increment the dictionary of mismatches 
    patient_mismatch = current_patient_entities - current_clinican_entities
    if len(patient_mismatch) > 0:
        patient_mismatch_rows.append(index) #if the number of mismatches is non zero, append
    for key in patient_mismatch:
        patient_mismatch_dict[key] += 1

    clinician_mismatch = current_clinican_entities - current_patient_entities
    if len(clinician_mismatch) > 0:
        clinician_mismatch_rows.append(index) #if the number of mismatches is non zero, append
    for key in clinician_mismatch:
        clinician_mismatch_dict[key] += 1


Printing sums of total mismatches

In [None]:
print("number of patient data:", sum(patient_mismatch_dict.values()))
print("number of clinician data:", sum(clinician_mismatch_dict.values()))

number of patient data: 1992
number of clinician data: 1663


number of rows of mismatches and percentages

In [None]:
print("percent of rows with patient data mismatches:", len(patient_mismatch_rows)/len(df_mini_pairs)*100)
print("percent of rows with clinician data mismatches:", len(clinician_mismatch_rows)/len(df_mini_pairs)*100)

percent of rows with patient data mismatches: 92.0
percent of rows with clinician data mismatches: 90.0


printing out mismatch dictionary with sorting

In [None]:
patient_mismatch_dict_sorted = sorted(patient_mismatch_dict.items(), key=lambda  item: item[1], reverse=True)
patient_mismatch_dict

{'Biological_structure': 128,
 'Sign_symptom': 169,
 'Detailed_description': 105,
 'Duration': 118,
 'Severity': 73,
 'Nonbiological_location': 107,
 'Disease_disorder': 143,
 'Coreference': 37,
 'Clinical_event': 59,
 'Subject': 105,
 'Date': 109,
 'Activity': 52,
 'Medication': 114,
 'Lab_value': 105,
 'Outcome': 20,
 'Frequency': 54,
 'Age': 73,
 'Dosage': 29,
 'Diagnostic_procedure': 117,
 'Time': 58,
 'Texture': 5,
 'History': 68,
 'Therapeutic_procedure': 50,
 'Sex': 10,
 'Quantitative_concept': 15,
 'Administration': 17,
 'Other_event': 11,
 'Occupation': 15,
 'Distance': 3,
 'Other_entity': 8,
 'Color': 4,
 'Family_history': 7,
 'Personal_background': 3,
 'Area': 1}

In [None]:
clinician_mismatch_dict_sorted = sorted(clinician_mismatch_dict.items(), key=lambda  item: item[1], reverse=True)
clinician_mismatch_dict_sorted

[('Detailed_description', 185),
 ('Activity', 172),
 ('Clinical_event', 155),
 ('Diagnostic_procedure', 130),
 ('Disease_disorder', 127),
 ('Medication', 122),
 ('Therapeutic_procedure', 110),
 ('Nonbiological_location', 105),
 ('Lab_value', 88),
 ('Sign_symptom', 76),
 ('Biological_structure', 65),
 ('Duration', 49),
 ('Severity', 46),
 ('Administration', 32),
 ('Coreference', 30),
 ('Subject', 23),
 ('Date', 19),
 ('History', 19),
 ('Frequency', 17),
 ('Time', 17),
 ('Other_event', 14),
 ('Dosage', 12),
 ('Other_entity', 11),
 ('Age', 10),
 ('Family_history', 9),
 ('Outcome', 6),
 ('Distance', 6),
 ('Quantitative_concept', 2),
 ('Texture', 2),
 ('Sex', 1),
 ('Occupation', 1),
 ('Personal_background', 1),
 ('Color', 1)]

Save this data to csv for further analysis

In [None]:
analysis_df = pd.DataFrame([patient_mismatch_dict, clinician_mismatch_dict])
analysis_df.insert(0, "names", ["patient", "clinician"]) ## add column name
analysis_df.head()

Unnamed: 0,names,Biological_structure,Sign_symptom,Detailed_description,Duration,Severity,Nonbiological_location,Disease_disorder,Coreference,Clinical_event,...,Quantitative_concept,Administration,Other_event,Occupation,Distance,Other_entity,Color,Family_history,Personal_background,Area
0,patient,128,169,105,118,73,107,143,37,59,...,15,17,11,15,3,8,4,7,3,1.0
1,clinician,65,76,185,49,46,105,127,30,155,...,2,32,14,1,6,11,1,9,1,


In [None]:
analysis_df.to_csv("Combined_BERT_entity_mismatch_count.csv")

## Looping to extract a set of terms, not entities

In [None]:
#create a count dictionary easily, this will help count the number of mismatches corresponding to a certain term
from collections import Counter

my_list = ["a", "b","c", "c","a","d" ]
counts_dict = Counter(my_list)
print(counts_dict)
#helper function to get list of terms corresponding to a entity name
def getEntityWords(input : list, name: str):
    if input == None:
        return []
    result = []
    for x in input:
        if(x["entity_group"] == name):
            result.append(x["word"])
    return result

Counter({'a': 2, 'c': 2, 'b': 1, 'd': 1})


Finding a set of medical terms that are mismatched


In [None]:
#track the row indices with a patient mismatch mapping diex w
patient_mismatch_rows = []
#track the rows with a  clinican mismatch
clinician_mismatch_rows = []

patient_entity_terms = []
clinician_entity_terms = []

patient_mismatch_type = [] #give the name of the mismatched entity that corresponds to the mismatch
clinician_mismatch_type = []


for index,row in df_mini_pairs.iterrows():
    current_patient_entities = set()
    current_clinican_entities = set()
    input = row["input"]
    output = row["output"]
    patient_results = None
    clinician_results = None
    try:  
        patient =  input.replace("patient:","").strip()
        patient_results = pipe(patient)
        #loop through the entities
        for dict in patient_results:
            curr_entity = dict["entity_group"]
            current_patient_entities.add(curr_entity)
    except:
        print("no patient mapping at index:", index)
        pass 

    try: 
        clinician = output.replace("doctor:","").strip()
        clinician_results = pipe(clinician)
        for dict in clinician_results:
            curr_entity = dict["entity_group"]
            current_clinican_entities.add(curr_entity)
    except:
        print("no clinican mapping at index:", index)
        pass 

    #find mismatches by computing set differneces 
    #entities mapped in patient text but not in the clinican text. increment the dictionary of mismatches 
    patient_mismatch = current_patient_entities - current_clinican_entities
    if len(patient_mismatch) > 0:
        patient_mismatch_rows.append(index) #if the number of mismatches is non zero, append
    for key in patient_mismatch:
        patient_mismatch_dict[key] += 1
        try: #append mismatched terms for patient data
            result = getEntityWords(patient_results,key)
            patient_entity_terms += result 
            patient_mismatch_type += [key]*len(result) #mismatch type is the key mismatched
        except: 
            pass
        
    clinician_mismatch = current_clinican_entities - current_patient_entities
    if len(clinician_mismatch) > 0:
        clinician_mismatch_rows.append(index) #if the number of mismatches is non zero, append
    for key in clinician_mismatch:
        clinician_mismatch_dict[key] += 1
        try: #append mismatched terms for clinician data
            result = getEntityWords(clinician_results,key)
            clinician_entity_terms += result
            clinician_mismatch_type += [key]*len(result) 
        except: 
            pass

Printing out side of patient terms and clincians terms and using set similarity

In [None]:
print(len(patient_entity_terms))
print(len(patient_mismatch_type))
patient_entity_terms

3335
3335


['over a week',
 'for the past two days',
 'low',
 'dr',
 'dr',
 'chest',
 'tight',
 'painful',
 'dry',
 'coronavirus',
 '19',
 'every',
 'high',
 '38 weeks',
 'daughter',
 'birth',
 'ce',
 'mis',
 '##rest',
 'low',
 'talking',
 'coronavirus',
 'virus',
 '36',
 'dirrhea',
 '##les',
 'vaccination',
 '10 month old',
 'son',
 'due',
 'me',
 '##as',
 'coronavirus pandemic',
 'wife',
 '##day',
 'evenings',
 '1',
 'thru',
 'hem',
 'np',
 'co',
 '##vid 19',
 'ga',
 'slight',
 'slight',
 'dry',
 'moon',
 'slight',
 '18 month old',
 'this',
 'corona',
 'cough',
 'headache',
 'nausea',
 'headache',
 'weeks',
 '3',
 'dry',
 'chest',
 'antibotic',
 '10 days',
 '1 week',
 'chest',
 'lungs',
 'x ray',
 '. s',
 'this',
 'ncov',
 'a few hours',
 'nose',
 'mild',
 'tight',
 'stuffy',
 'difficulty',
 'breathing',
 'feverish',
 '10 %',
 'mostly fine',
 'yesterday',
 'this morning',
 'hospital',
 '3 months',
 'boyfriend',
 'grandmother',
 'family',
 'hurt',
 'cough on',
 'one',
 '##ic',
 'mid february',
 

In [None]:
print(len(clinician_entity_terms))
print(len(clinician_mismatch_type))
clinician_entity_terms

2831
2831


['co',
 '##vid - 19',
 'infections',
 '##vid',
 'call',
 'age',
 'over 5 - 14 days',
 'high',
 'risk',
 'persistent',
 'dry',
 'symptoms',
 'tired',
 'dry',
 'fever',
 'in',
 '##fective',
 'self',
 'isolation',
 'symptoms',
 'fever',
 'cough',
 'hygiene',
 'covid - 19',
 'covid',
 '19',
 'video',
 'text chat',
 'worse',
 'hydra',
 'pneumonia',
 'pneumonia',
 'drink',
 'oral',
 'covid 19',
 'covid 19',
 'airborne',
 'probiotic',
 'text chat',
 'video',
 'mask',
 'gown',
 'eye protection',
 'n95',
 'corona',
 'va',
 'daily',
 'co',
 '##vid testing',
 'better',
 'for a week',
 '##k',
 'gee',
 'sa',
 'ni',
 'ji',
 'no',
 'kan',
 'ex',
 'sluim',
 'sl',
 '##uim',
 'all',
 'ho',
 '##oo',
 'moon',
 '##zee',
 'virus',
 'he',
 'allergy',
 'b',
 'throat',
 'throat',
 'throat',
 'throat',
 'st',
 '##p',
 'all',
 'co',
 '##vid -',
 'infections',
 'strep',
 'hoarseness',
 'no kids',
 'video',
 'baby',
 'stay',
 'home',
 'stay',
 'drink',
 'rest',
 'coronavirus',
 'home',
 'welcome',
 'pneumonia',
 '

computing jaccard similarity

In [None]:
pghd_set = set(patient_entity_terms)
clinician_set = set(clinician_entity_terms)

jaccard = len(pghd_set & clinician_set) / len(pghd_set | clinician_set)
print(f"Conceptual Overlap (Jaccard): {jaccard:.3f}")

Conceptual Overlap (Jaccard): 0.126


In [None]:
print(pghd_set-clinician_set)

{'##gs', 'anti his', 'tremendous', 'every years', 'birth', 'sister', 'hurt', 'one month after', '9 days ago', 'psudomonas ae', 'later', 'flat', 'last monday', 'painful', 'act up', 'cruddy', 'come for a while', '##wn', '##aq', 'migraine', 'virus 19', 'burning', '##notty nose', 'day progresses', '##bromyalgia', 'ashmatic', '3 weeks ago', '14 months', 'staph', 'swelling', 'hour and a half', 'cough syrup', 'carona virus', '9 days', '25 yr', '11 & 12 march', '240', 'scar', '##oliar', 'london', 'irritated', '##te', '##mo', 'times', 'pendemic', 'victoria', '##rrhea', 'at night', 'sun', '8 time', 'tr', '##thama', 'urgent care', 'broke', '##ching', 'antihistamine', 'close', 'pnemonoia', 'homeopathy', 'rna', '##sions', 'pe', 'spite', 'expression', '240 2mg', 'in the next week', '##hand', '21 month old', 'family doctor', 'aug', '##years', 'cape town', 'son', 'fi', 'gram', 'great shape', '3 / 5 days', 'british', 'ba', 'friend', 'shoes', 'january 3rd', '25 jan 2020', 'jhb', 'pus', '10 month old', '

using counter to create a dictionary of mismatched words from the patient side vs clinician side save the data

In [None]:
mismatches_1 = pd.DataFrame({
    'word': patient_entity_terms,
    'entity': patient_mismatch_type
})
mismatches_1.to_csv("Combined_BERT_patient_mismatch_terms.csv")

mismatches_2 = pd.DataFrame({
    'word': clinician_entity_terms,
    'entity': clinician_mismatch_type
})
mismatches_2.to_csv("Combined_BERT_clinician_mismatch_terms.csv")

## future:

Showing (through concepts per 100 words): 
QuickUMLS ontology has beeter coveration