## Part 1: Reading JSON file into a dataframe and saving/examining it.

In [1]:
import pandas as pd
import json

Large data length and size

In [2]:
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


Extract the BigBio dataset

In [3]:

# Load the English MedDialog dataset (BigBio version)
datasets = load_dataset("UCSD26/medical_dialog", "processed.en")

# Available splits: train, validation, test
print(datasets)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading builder script: 16.3kB [00:00, 16.6MB/s]
Downloading readme: 10.9kB [00:00, 10.1MB/s]
Downloading data: 100%|██████████| 414k/414k [00:00<00:00, 2.73MB/s]
Downloading data: 100%|██████████| 57.7k/57.7k [00:00<00:00, 1.28MB/s]
Downloading data: 100%|██████████| 52.0k/52.0k [00:00<00:00, 1.30MB/s]
Generating train split: 100%|██████████| 482/482 [00:00<00:00, 26252.53 examples/s]
Generating validation split: 100%|██████████| 60/60 [00:00<00:00, 5805.53 examples/s]
Generating test split: 100%|██████████| 61/61 [00:00<00:00, 5960.32 examples/s]

DatasetDict({
    train: Dataset({
        features: ['description', 'utterances'],
        num_rows: 482
    })
    validation: Dataset({
        features: ['description', 'utterances'],
        num_rows: 60
    })
    test: Dataset({
        features: ['description', 'utterances'],
        num_rows: 61
    })
})





In [4]:
dfs = []
for key in datasets:
    df1 = pd.DataFrame(datasets[key])
    dfs.append(df1)
df_mini = pd.concat(dfs)

In [5]:
df_mini.head()

Unnamed: 0,description,utterances
0,throat a bit sore and want to get a good imune...,[patient: throat a bit sore and want to get a ...
1,"hey there i have had cold ""symptoms"" for over ...","[patient: hey there i have had cold ""symptoms""..."
2,i have a tight and painful chest with a dry co...,[patient: i have a tight and painful chest wit...
3,what will happen after the incubation period f...,[patient: what will happen after the incubatio...
4,suggest treatment for pneumonia,[patient: just found out i was pregnant. yeste...


## Further Processing the data: Using the BioBert model on a single data sample

### analyzing the minidata to see how many conversations are there with two utterances (two parts)

First, we need to convert the utterances row to a list

In [6]:
utterance_count_dict = {}
for index, row in df_mini.iterrows():
    utterances = row["utterances"]
    utterance_count = len(utterances)#get the current utterance count 
    if utterance_count not in utterance_count_dict: 
        utterance_count_dict[utterance_count] = 1
    else:
        utterance_count_dict[utterance_count] = utterance_count_dict[utterance_count] + 1

printing the length of utterances:

In [7]:
print(utterance_count_dict)

{2: 600, 17: 1, 4: 1, 8: 1}


As we can see the vast majority of the dataset contains 2 way utterances

### isolating the utterance pairs (they always start with patient)

In [8]:
df_mini_pairs = df_mini[df_mini["utterances"].apply(len) == 2]
print("number of minipairs data:", len(df_mini_pairs))

number of minipairs data: 600


In [9]:
df_mini_pairs.head(n=1)


Unnamed: 0,description,utterances
0,throat a bit sore and want to get a good imune...,[patient: throat a bit sore and want to get a ...


Example reading a line by index

In [10]:
#
index = 0
print("patient text:", df_mini_pairs.iloc[index]["utterances"][0])
print("clinician text:",df_mini_pairs.iloc[index]["utterances"][1])

patient text: patient: throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus.
clinician text: doctor: during this pandemic. throat pain can be from a strep throat infection (antibiotics needed), a cold or influenza or other virus, or from some other cause such as allergies or irritants. usually, a person sees the doctor (call first) if the sore throat is bothersome, recurrent, or doesn't go away quickly. covid-19 infections tend to have cough, whereas strep throat usually lacks cough but has more throat pain. (3/21/20)


Saving the paired data

In [11]:
df_mini_pairs.to_csv("english-train-paired-conversations.csv")

### now, testing BioBert on this data 

We load a fine-tuned version of BioBert

In [5]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")

pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") # pass device=0 if using gpu
pipe("""The patient reported no recurrence of palpitations at follow-up 6 months after the ablation.""")

Device set to use cpu


[{'entity_group': 'Sign_symptom',
  'score': np.float32(0.9999311),
  'word': 'pal',
  'start': 38,
  'end': 41},
 {'entity_group': 'Sign_symptom',
  'score': np.float32(0.90633166),
  'word': '##pitations',
  'start': 41,
  'end': 50},
 {'entity_group': 'Clinical_event',
  'score': np.float32(0.99975544),
  'word': 'follow',
  'start': 54,
  'end': 60},
 {'entity_group': 'Date',
  'score': np.float32(0.999867),
  'word': '6 months after',
  'start': 64,
  'end': 78}]

Testing data on a single datapoint.

Split based on last : found

In [6]:
#
index = 0
patient = df_mini_pairs.iloc[index]["utterances"][0].replace("patient:","").strip() #raw patient string replacing doctor, patient
clinician = df_mini_pairs.iloc[index]["utterances"][1].replace("doctor:","").strip() #raw clinican stinrg
print(patient,"\n",clinician)

NameError: name 'df_mini_pairs' is not defined

In [None]:
pipe(patient)

In [None]:
result = pipe(clinician)
print(result)

In [None]:
result[0]["entity_group"]

# Finding entities: Analyzing through the mini data using BioBert


Finding the entity_groups found for both patient and clinician data, and the count of mapped entities for each patient category

In [None]:
patient_entity_dict = {}
clinician_entity_dict = {}
for index,row in df_mini_pairs.iterrows():
    utterances = row["utterances"]
    
    try:  
        patient =  utterances[0].replace("patient:","").strip()
        patient_results = pipe(patient)
        #loop through the ntities
        for dict in patient_results:
            curr_entity = dict["entity_group"]
            if curr_entity not in patient_entity_dict:
                patient_entity_dict[curr_entity] = 1
            else: 
                patient_entity_dict[curr_entity] = patient_entity_dict[curr_entity] + 1
    except:
        print("no patient mapping at index:", index)
        pass 

    try: 
        clinician = utterances[1].replace("doctor:","").strip()
        clinician_results = pipe(clinician)
        for dict in clinician_results:
            curr_entity = dict["entity_group"]
            if curr_entity not in clinician_entity_dict:
                clinician_entity_dict[curr_entity] = 1
            else: 
                clinician_entity_dict[curr_entity] = clinician_entity_dict[curr_entity] + 1
    except:
        print("no clinican mapping at index:", index)
        pass 


Analyzing the set of patient entity

In [None]:
print(patient_entity_dict)

In [None]:
print(clinician_entity_dict)

Finding the sum of values in total

In [None]:
print("number of patient data:", sum(patient_entity_dict.values()))
print("number of clinicain data:", sum(clinician_entity_dict.values()))

Since 480 total datapoints, on  patient data is mapped : (per dialogue)

In [None]:
4440/480

for clinican data:

In [None]:
4082/480

Sorting based on values

In [None]:
patient_entity_dict_sorted = sorted(patient_entity_dict.items(), key=lambda  item: item[1], reverse=True)
patient_entity_dict_sorted

In [None]:
clinican_entity_dict_sorted = sorted(clinician_entity_dict.items(), key=lambda  item: item[1], reverse=True)
clinican_entity_dict_sorted

Finding the set differences

In [None]:
set(patient_entity_dict.keys()).difference(set(clinician_entity_dict.keys()))

In [None]:
set(clinician_entity_dict.keys()).difference(set(patient_entity_dict.keys()))

We can see that there are few mappings. Additionally, we 

## looping through the minidata to find mismatches between entities the paired data

Now, we can perform a different type of analysis. Now, that we know that there is a discrepancy between total values, what are differences in the occurance per dialogue. We use the list of found entities before 

In [None]:
clinician_entity_dict.keys()

In [None]:
#initialize a dictionary of mismatches 
patient_mismatch_dict = {} #entities mapped in patient text but not in the clinican text
clinician_mismatch_dict = {}  #entities mapped in clinican text but not in the patient text
for key in list(patient_entity_dict.keys()):
    patient_mismatch_dict[key] = 0
for key in list(clinician_entity_dict.keys()):
    clinician_mismatch_dict[key] = 0


In [None]:
#track the row indices with a patient mismatch mapping diex w
patient_mismatch_rows = []
#track the rows with a  clinican mismatch
clinician_mismatch_rows = []

for index,row in df_mini_pairs.iterrows():
    current_patient_entities = set()
    current_clinican_entities = set()
    utterances = row["utterances"]
    
    try:  
        patient =  utterances[0].replace("patient:","").strip()
        patient_results = pipe(patient)
        #loop through the ntities
        for dict in patient_results:
            curr_entity = dict["entity_group"]
            current_patient_entities.add(curr_entity)
    except:
        print("no patient mapping at index:", index)
        pass 

    try: 
        clinician = utterances[1].replace("doctor:","").strip()
        clinician_results = pipe(clinician)
        for dict in clinician_results:
            curr_entity = dict["entity_group"]
            current_clinican_entities.add(curr_entity)
    except:
        print("no clinican mapping at index:", index)
        pass 
    #find mismatches by computing set differneces 
    #entities mapped in patient text but not in the clinican text. increment the dictionary of mismatches 
    patient_mismatch = current_patient_entities - current_clinican_entities
    if len(patient_mismatch) > 0:
        patient_mismatch_rows.append(index) #if the number of mismatches is non zero, append
    for key in patient_mismatch:
        patient_mismatch_dict[key] += 1

    clinician_mismatch = current_clinican_entities - current_patient_entities
    if len(clinician_mismatch) > 0:
        clinician_mismatch_rows.append(index) #if the number of mismatches is non zero, append
    for key in clinician_mismatch:
        clinician_mismatch_dict[key] += 1


Printing sums of total mismatches

In [None]:
print("number of patient data:", sum(patient_mismatch_dict.values()))
print("number of clinician data:", sum(clinician_mismatch_dict.values()))

number of rows of mismatches and percentages

In [None]:
print("percent of rows with patient data mismatches:", len(patient_mismatch_rows)/len(df_mini_pairs)*100)
print("percent of rows with clinician data mismatches:", len(clinician_mismatch_rows)/len(df_mini_pairs)*100)

printing out mismatch dictionary with sorting

In [None]:
patient_mismatch_dict = sorted(patient_mismatch_dict.items(), key=lambda  item: item[1], reverse=True)
patient_mismatch_dict

In [None]:
clinician_mismatch_dict = sorted(clinician_mismatch_dict.items(), key=lambda  item: item[1], reverse=True)
clinician_mismatch_dict

## Looping to extract a set of terms, not entities

Finding a set of medical terms, not entities. This could be large


In [None]:
patient_entity_terms = {}
clinician_entity_terms = {}
for index,row in df_mini_pairs.iterrows():
    utterances = row["utterances"]
    
    try:  
        patient =  utterances[0].replace("patient:","").strip()
        patient_results = pipe(patient)
        #loop through the ntities
        for dict in patient_results:
            curr_entity = dict["word"]
            if curr_entity not in patient_entity_terms:
                patient_entity_terms[curr_entity] = 1
            else: 
                patient_entity_terms[curr_entity] = patient_entity_terms[curr_entity] + 1
    except:
        print("no patient mapping at index:", index)
        pass 

    try: 
        clinician = utterances[1].replace("doctor:","").strip()
        clinician_results = pipe(clinician)
        for dict in clinician_results:
            curr_entity = dict["word"]
            if curr_entity not in clinician_entity_terms:
                clinician_entity_terms[curr_entity] = 1
            else: 
                clinician_entity_terms[curr_entity] = clinician_entity_terms[curr_entity] + 1
    except:
        print("no clinican mapping at index:", index)
        pass 


Printing out side of patient terms and clincians terms and using set similarity

In [None]:
print(len(patient_entity_terms))
patient_entity_terms

In [None]:
print(len(clinician_entity_terms))
clinician_entity_terms

computing jaccard similarity

In [None]:
pghd_set = set(patient_entity_terms.keys())
clinician_set = set(clinician_entity_terms.keys())

jaccard = len(pghd_set & clinician_set) / len(pghd_set | clinician_set)
print(f"Conceptual Overlap (Jaccard): {jaccard:.3f}")

In [None]:
print(pghd_set-clinician_set)

## future:

Showing (through concepts per 100 words): 
QuickUMLS ontology has beeter coveration