# Format and Label MTS-Dialog Data

## Parse Questions and Answers

In [1]:
import pandas as pd
import re

# Load the MTS-Dialog file
mts_df = pd.read_csv("MTS-Dialog-TrainingSet.csv")

# One match per speaker turn: (speaker label, utterance text).
# Only Doctor, Patient and Guest_family[_N] are captured as speakers.
# Guest_clinician appears solely in the lookahead, so it terminates the
# preceding utterance but its own text is never emitted as a segment.
# Compiled once here instead of being re-parsed for every record.
SEGMENT_PATTERN = re.compile(
    r"(Doctor|Patient|Guest_family(?:_\d+)?):\s*(.*?)\s*(?=Doctor:|Patient:|Guest_family|Guest_clinician|$)",
    flags=re.DOTALL,
)

# Rows are collected in plain lists and turned into DataFrames once at the
# end; growing a DataFrame with `df.loc[len(df)] = ...` copies data on every
# insert and gets slower as the frame grows.
question_rows = []
answer_rows = []

# Running counter shared by all encounters. It only advances after a
# non-doctor utterance, so a doctor question and the reply that follows it
# carry the same turn_id (consecutive doctor utterances share one id too).
turn_id = 0

for _, rec in mts_df.iterrows():
    for speaker, utterance in SEGMENT_PATTERN.findall(rec["dialogue"]):
        if speaker == "Doctor":
            question_rows.append({
                "encounter_id": rec["ID"],
                "turn_id": turn_id,
                "doctor_q": utterance.strip(),
            })
        else:
            # Patient and Guest_family utterances are both stored as answers.
            answer_rows.append({
                "encounter_id": rec["ID"],
                "turn_id": turn_id,
                "patient_a": utterance.strip(),
            })
            turn_id += 1

# Explicit column lists keep the CSV headers stable even when no rows matched.
q_df = pd.DataFrame(question_rows, columns=["encounter_id", "turn_id", "doctor_q"])
a_df = pd.DataFrame(answer_rows, columns=["encounter_id", "turn_id", "patient_a"])

q_df.to_csv("output/mts_dialog_questions.csv", index=False)
a_df.to_csv("output/mts_dialog_answers.csv", index=False)

## Label Findings

Before running this section, use the kaggle_zeroshot_qa_labeling.ipynb notebook to label the emotes in mts_dialog_questions.csv and the answer types in mts_dialog_answers.csv. The output folder should then contain:
* mts_dialog_questions_w_emotes.csv
* mts_dialog_answers_with_yn.csv

In [None]:
import pandas as pd
import json
import re

# Doctor questions, already labelled with emotes by the external notebook.
q_df = pd.read_csv("output/mts_dialog_questions_w_emotes.csv")

# Pattern map consumed by extract_concept as {concept: [regex, ...]}.
# JSON text is UTF-8; pin the encoding so loading does not depend on the
# platform's default locale encoding.
with open("revised_input/symptom_pattern_map.json", "r", encoding="utf-8") as f:
    kb_pattern_map = json.load(f)

def extract_concept(text: str) -> str:
    """Return the first concept whose pattern matches *text*.

    Concepts are tried in the iteration order of the module-level
    ``kb_pattern_map`` (``{concept: [regex, ...]}``); within a concept the
    patterns are tried in list order. Matching is case-insensitive.

    Args:
        text: The utterance to scan. Non-string values (for example the NaN
            pandas uses for an empty cell) are treated as having no match.

    Returns:
        The matching concept name, or the string ``"none"`` when nothing
        matches.
    """
    if not isinstance(text, str):
        return "none"

    for concept, regex_list in kb_pattern_map.items():
        for pattern in regex_list:
            try:
                matched = re.search(pattern, text, re.IGNORECASE)
            except (re.error, TypeError):
                # A malformed or non-string pattern only disqualifies itself;
                # keep scanning the remaining patterns and concepts instead
                # of giving up on the whole lookup.
                continue
            if matched:
                return concept
    return "none"


# Tag each doctor question with the concept returned by extract_concept
# ("none" when no pattern matches).
q_df["finding"] = q_df["doctor_q"].map(extract_concept)

# Persist the labelled questions, then display the frame as the cell output.
q_df.to_csv(
    "output/mts_dialog_questions_with_emotes_and_findings.csv",
    index=False,
)
q_df

Unnamed: 0,encounter_id,turn_id,doctor_q,emote,finding
0,0,0,"What brings you back into the clinic today, miss?",Neutral,none
1,0,1,It looks like Doctor Kumar followed up with yo...,Neutral,none
2,0,2,"Have you had any fever or chills, cough, conge...",Neutral,cough
3,0,3,"Great. Also, for our records, how old are you ...",Affirmative,none
4,1,4,How're you feeling today?,Neutral,none
...,...,...,...,...,...
5804,1199,5475,I'm originally from Kentucky. And I have to do...,Neutral,smoking
5805,1199,5476,Got it.,Affirmative,none
5806,1200,5476,Looks like the nurse came in and asked you eve...,Neutral,none
5807,1200,5477,Do you smoke?,Neutral,smoking


## Combine Questions and Answers

In [None]:
import pandas as pd

# Labelled doctor questions and labelled patient answers.
q_df = pd.read_csv("output/mts_dialog_questions_with_emotes_and_findings.csv")
a_df = pd.read_csv("output/mts_dialog_answers_with_yn.csv")

# Pair each question with the answer that shares its (turn_id, encounter_id).
# The inner join is spelled out on purpose: a question without an answer for
# the same key (or an answer without a question) is dropped from the result.
qa_df = q_df.merge(a_df, on=["turn_id", "encounter_id"], how="inner")

qa_df.to_csv("output/question_answer_pairs.csv", index=False)
qa_df

Unnamed: 0,encounter_id,turn_id,doctor_q,emote,finding,patient_a,affirmative
0,0,0,"What brings you back into the clinic today, miss?",Neutral,none,I came in for a refill of my blood pressure me...,Unknown
1,0,1,It looks like Doctor Kumar followed up with yo...,Neutral,none,No.,False
2,0,2,"Have you had any fever or chills, cough, conge...",Neutral,cough,No.,False
3,0,3,"Great. Also, for our records, how old are you ...",Affirmative,none,I am seventy six years old and identify as a w...,Unknown
4,1,4,How're you feeling today?,Neutral,none,Terrible. I'm having the worst headache of my ...,Unknown
...,...,...,...,...,...,...,...
5437,1199,5474,"Awesome. I moved here in O nine, so two years ...",Affirmative,none,No way! How funny. Where from?,Unknown
5438,1199,5475,I'm originally from Kentucky. And I have to do...,Neutral,smoking,No to all of the above.,False
5439,1200,5476,Looks like the nurse came in and asked you eve...,Neutral,none,"Oh, it's a senior citizen house. They have man...",Unknown
5440,1200,5477,Do you smoke?,Neutral,smoking,No.,False


## Create Final Dataset with Previous Responses

In [5]:
import pandas as pd

# Load the question/answer pairs and index them by turn_id so a row can be
# looked up by its turn number with .loc.
qa_df = pd.read_csv("output/question_answer_pairs.csv").set_index("turn_id")
qa_df

Unnamed: 0_level_0,encounter_id,doctor_q,emote,finding,patient_a,affirmative
turn_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,"What brings you back into the clinic today, miss?",Neutral,none,I came in for a refill of my blood pressure me...,Unknown
1,0,It looks like Doctor Kumar followed up with yo...,Neutral,none,No.,False
2,0,"Have you had any fever or chills, cough, conge...",Neutral,cough,No.,False
3,0,"Great. Also, for our records, how old are you ...",Affirmative,none,I am seventy six years old and identify as a w...,Unknown
4,1,How're you feeling today?,Neutral,none,Terrible. I'm having the worst headache of my ...,Unknown
...,...,...,...,...,...,...
5474,1199,"Awesome. I moved here in O nine, so two years ...",Affirmative,none,No way! How funny. Where from?,Unknown
5475,1199,I'm originally from Kentucky. And I have to do...,Neutral,smoking,No to all of the above.,False
5476,1200,Looks like the nurse came in and asked you eve...,Neutral,none,"Oh, it's a senior citizen house. They have man...",Unknown
5477,1200,Do you smoke?,Neutral,smoking,No.,False


In [12]:
# Work on a copy: a plain `new_df = qa_df` is only an alias, so adding the
# prev_* columns would also modify the frame being iterated.
new_df = qa_df.copy()

# One entry per row of qa_df, in iteration order; assigned as whole columns
# after the loop instead of writing four cells per row through .loc.
prev_doctor_qs = []
prev_patient_as = []
prev_findings = []
prev_affirmatives = []

for turn, row in qa_df.iterrows():
    # Defaults used when there is no usable previous turn.
    prev_dr_turn = "none"
    prev_pt_resp = "none"
    prev_affirmative = "none"
    prev_finding = "none"

    if turn > 0:
        try:
            prev_turn = qa_df.loc[turn - 1]
        except KeyError:
            # No row carries the preceding turn_id.
            prev_turn = None

        # .loc returns a DataFrame (not a Series) when several rows share the
        # label turn - 1. That ambiguous case keeps the "none" defaults, which
        # is what the former bare `except: pass` produced implicitly.
        # Context is only carried over within the same encounter.
        if (
            isinstance(prev_turn, pd.Series)
            and prev_turn["encounter_id"] == row["encounter_id"]
        ):
            prev_dr_turn = prev_turn["doctor_q"]
            prev_pt_resp = prev_turn["patient_a"]
            prev_affirmative = prev_turn["affirmative"]
            prev_finding = prev_turn["finding"]

    prev_doctor_qs.append(prev_dr_turn)
    prev_patient_as.append(prev_pt_resp)
    prev_findings.append(prev_finding)
    prev_affirmatives.append(prev_affirmative)

# Same column order as before: question, answer, finding, affirmative.
new_df["prev_doctor_q"] = prev_doctor_qs
new_df["prev_patient_a"] = prev_patient_as
new_df["prev_finding"] = prev_findings
new_df["prev_affirmative"] = prev_affirmatives

# Both exports omit the turn_id index (orient='records' / index=False).
df_json = new_df.to_json(orient='records', lines=True)

with open("output/question_answer_dataset.jsonl", "w") as f:
    f.write(df_json + '\n')

new_df.to_csv("output/question_answer_dataset.csv", index=False)
new_df

Unnamed: 0_level_0,encounter_id,doctor_q,emote,finding,patient_a,affirmative,prev_doctor_q,prev_patient_a,prev_finding,prev_affirmative
turn_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,"What brings you back into the clinic today, miss?",Neutral,none,I came in for a refill of my blood pressure me...,Unknown,none,none,none,none
1,0,It looks like Doctor Kumar followed up with yo...,Neutral,none,No.,False,"What brings you back into the clinic today, miss?",I came in for a refill of my blood pressure me...,none,Unknown
2,0,"Have you had any fever or chills, cough, conge...",Neutral,cough,No.,False,It looks like Doctor Kumar followed up with yo...,No.,none,False
3,0,"Great. Also, for our records, how old are you ...",Affirmative,none,I am seventy six years old and identify as a w...,Unknown,"Have you had any fever or chills, cough, conge...",No.,cough,False
4,1,How're you feeling today?,Neutral,none,Terrible. I'm having the worst headache of my ...,Unknown,none,none,none,none
...,...,...,...,...,...,...,...,...,...,...
5474,1199,"Awesome. I moved here in O nine, so two years ...",Affirmative,none,No way! How funny. Where from?,Unknown,How long-,Moved up here in O seven.,none,Unknown
5475,1199,I'm originally from Kentucky. And I have to do...,Neutral,smoking,No to all of the above.,False,"Awesome. I moved here in O nine, so two years ...",No way! How funny. Where from?,none,Unknown
5476,1200,Looks like the nurse came in and asked you eve...,Neutral,none,"Oh, it's a senior citizen house. They have man...",Unknown,none,none,none,none
5477,1200,Do you smoke?,Neutral,smoking,No.,False,Looks like the nurse came in and asked you eve...,"Oh, it's a senior citizen house. They have man...",none,Unknown


## Combine with Simulated Cases

Make sure to run generate_clinical_dialogue.py first to get output/generated_cases.csv

In [13]:
import pandas as pd
# Simulated cases written by generate_clinical_dialogue.py.
generated_df = pd.read_csv("output/generated_cases.csv")

# Stack the simulated rows under the MTS-Dialog rows; ignore_index=True
# replaces both frames' indexes with a fresh 0..n-1 range.
# NOTE(review): re-running this cell appends the generated rows again.
new_df = pd.concat(
    objs=[new_df, generated_df],
    ignore_index=True,
)

In [14]:
# Overwrite both dataset exports with the combined (real + simulated) rows.
new_df.to_csv("output/question_answer_dataset.csv", index=False)

# JSON Lines export: one record per line, followed by one extra newline.
df_json = new_df.to_json(orient='records', lines=True)
with open("output/question_answer_dataset.jsonl", "w") as jsonl_file:
    jsonl_file.write(df_json + '\n')

new_df

Unnamed: 0,encounter_id,doctor_q,emote,finding,patient_a,affirmative,prev_doctor_q,prev_patient_a,prev_finding,prev_affirmative
0,0,"What brings you back into the clinic today, miss?",Neutral,none,I came in for a refill of my blood pressure me...,Unknown,none,none,none,none
1,0,It looks like Doctor Kumar followed up with yo...,Neutral,none,No.,False,"What brings you back into the clinic today, miss?",I came in for a refill of my blood pressure me...,none,Unknown
2,0,"Have you had any fever or chills, cough, conge...",Neutral,cough,No.,False,It looks like Doctor Kumar followed up with yo...,No.,none,False
3,0,"Great. Also, for our records, how old are you ...",Affirmative,none,I am seventy six years old and identify as a w...,Unknown,"Have you had any fever or chills, cough, conge...",No.,cough,False
4,1,How're you feeling today?,Neutral,none,Terrible. I'm having the worst headache of my ...,Unknown,none,none,none,none
...,...,...,...,...,...,...,...,...,...,...
15437,999,Have you been experiencing any nasal congestio...,Neutral,nasal congestion,No.,False,When did your foot pain start?,No.,foot pain,False
15438,999,Have you been experiencing insomnia?,Neutral,insomnia,No.,False,Have you been experiencing any nasal congestio...,No.,nasal congestion,False
15439,999,Do you have cold hands?,Neutral,cold hands,No.,False,Have you been experiencing insomnia?,No.,insomnia,False
15440,999,Are you being treated for blood clots?,Neutral,blood clots,No.,False,Do you have cold hands?,No.,cold hands,False


## Alternative Heuristic-Based Methods for Labeling Emotes and Affirmative Answers

In [18]:
import pandas as pd
import re

In [None]:
# rule-based emotion annotation
# Categories are tried in this order and the first hit wins. Empathy comes
# before apology because the empathy phrase "i'm sorry to hear" also contains
# the apology keyword "sorry" and would otherwise never be reached.
_EMOTION_RULES = (
    ("empathy", ("that must be hard", "i understand", "that's unfortunate",
                 "i'm sorry to hear", "sounds difficult", "that's worrisome")),
    ("apology", ("sorry", "apologize", "apologies", "i regret")),
    ("affirmative", ("thanks", "okay", "got it", "understood", "sure",
                     "great", "noted", "i see", "good")),
)

# Whole-word / whole-phrase matching: a plain substring test makes "sure"
# fire inside "pressure" and "good" inside "goodbye".
_EMOTION_PATTERNS = tuple(
    (label, re.compile(r"\b(?:" + "|".join(re.escape(p) for p in phrases) + r")\b"))
    for label, phrases in _EMOTION_RULES
)


def classify_emotion(text):
    """Label *text* as "empathy", "apology", "affirmative" or "none".

    Matching is case-insensitive and on whole words/phrases. Typographic
    apostrophes (U+2019) are treated like straight ones, so "that’s" and
    "that's" behave the same.

    Args:
        text: Utterance to classify. Non-string values (e.g. NaN from an
            empty CSV cell) yield "none".

    Returns:
        The label of the first category with a matching phrase, else "none".
    """
    if not isinstance(text, str):
        return "none"

    normalized = text.lower().replace("\u2019", "'")
    for label, pattern in _EMOTION_PATTERNS:
        if pattern.search(normalized):
            return label
    return "none"

# find prefix statements before questions
def extract_emote_phrase(text):
    """Return a short opening sentence that precedes the rest of *text*.

    The stripped text is split after sentence-ending punctuation (., ! or ?)
    followed by whitespace. The first piece is returned only when there is
    more than one piece and the first one has at most 12 whitespace-separated
    words; otherwise the result is None.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())

    # A single sentence has no preface in front of it.
    if len(sentences) < 2:
        return None

    opener = sentences[0]
    word_count = len(opener.split())
    return opener if word_count <= 12 else None

# rule based yes/no annotation
# Raw strings are required: in a normal string "\b" is a backspace character,
# not the regex word-boundary assertion.
_YES_PATTERN = re.compile(r"\b(?:yes|yeah)\b", re.IGNORECASE)
_NO_PATTERN = re.compile(r"\b(?:no|nope)\b", re.IGNORECASE)


def yes_no_responses(text):
    """Classify *text* as "yes", "no" or "none" by whole-word keywords.

    "yes"/"yeah" are checked before "no"/"nope", so a reply containing both
    is labelled "yes". Matching is case-insensitive and on whole words, so
    "know" or "yesterday" do not count.

    Args:
        text: Reply to classify. Non-string values (e.g. NaN from an empty
            CSV cell) yield "none".

    Returns:
        "yes", "no", or "none" when neither keyword group is present.
    """
    if not isinstance(text, str):
        return "none"
    if _YES_PATTERN.search(text):
        return "yes"
    if _NO_PATTERN.search(text):
        return "no"
    return "none"