In [20]:
import pandas as pd
import re


# 2) Load the MTS-Dialog file
mts_df = pd.read_csv("MTS-Dialog-TrainingSet.csv")

q_df = pd.DataFrame({
    "encounter_id" : [],
    "turn_id" : [],
    "doctor_q" : [],
})

a_df = pd.DataFrame({
    "encounter_id" : [],
    "turn_id" : [],
    "patient_a" : [],
})

encounter_id = 0
turn_id = 0

for _, rec in mts_df.iterrows():

    dialogue = rec["dialogue"]
    segments = re.findall(
        r"(Doctor|Patient|Guest_family(?:_\d+)?):\s*(.*?)\s*(?=Doctor:|Patient:|Guest_family|Guest_clinician|$)",
        dialogue, flags=re.DOTALL
    )

    for speaker, utterance in segments:
        if speaker == "Doctor":
            q_df.loc[len(q_df)] = {
                "encounter_id" : rec["ID"],
                "turn_id" : turn_id,
                "doctor_q" : utterance.strip()
            }
        else:
            a_df.loc[len(a_df)] = {
                "encounter_id" : rec["ID"],
                "turn_id" : turn_id,
                "patient_a" : utterance.strip()
            }      
            turn_id += 1

q_df.to_csv("output/mts_dialog_questions.csv", index=False)
a_df.to_csv("output/mts_dialog_answers.csv", index=False)

q_df
a_df

Unnamed: 0,encounter_id,turn_id,patient_a
0,0,0,I came in for a refill of my blood pressure me...
1,0,1,No.
2,0,2,No.
3,0,3,I am seventy six years old and identify as a w...
4,1,4,Terrible. I'm having the worst headache of my ...
...,...,...,...
5474,1199,5474,No way! How funny. Where from?
5475,1199,5475,No to all of the above.
5476,1200,5476,"Oh, it's a senior citizen house. They have man..."
5477,1200,5477,No.


In [92]:
import json
import re

q_df = pd.read_csv("output/mts_dialog_questions_w_emotes.csv")

with open("revised_input/symptom_pattern_map.json", "r") as f:
    kb_pattern_map = json.load(f)

def extract_concept(text: str) -> str:
    for concept, regex_list in kb_pattern_map.items():
        for pattern in regex_list:
            try:
                if re.search(pattern, text, re.IGNORECASE):
                    return concept
            except: 
                return "none"
    return "none"

# 4) Apply to each question
q_df["finding"] = q_df["doctor_q"].apply(extract_concept)

q_df.to_csv("output/doctor_q_with_emotes_and_findings.csv", index=False)

In [90]:
q_df

Unnamed: 0,encounter_id,turn_id,doctor_q,emote,finding
0,0,0,"What brings you back into the clinic today, miss?",Neutral,none
1,0,1,It looks like Doctor Kumar followed up with yo...,Neutral,none
2,0,2,"Have you had any fever or chills, cough, conge...",Neutral,cough
3,0,3,"Great. Also, for our records, how old are you ...",Affirmative,none
4,1,4,How're you feeling today?,Neutral,none
...,...,...,...,...,...
5804,1199,5475,I'm originally from Kentucky. And I have to do...,Neutral,none
5805,1199,5476,Got it.,Affirmative,none
5806,1200,5476,Looks like the nurse came in and asked you eve...,Neutral,none
5807,1200,5477,Do you smoke?,Neutral,none


In [103]:
import pandas as pd

q_df = pd.read_csv("output/doctor_q_with_emotes_and_findings.csv")
a_df = pd.read_csv("output/mts_dialog_answers.csv")
next_encounter_id = 1

qa_df = q_df.merge(a_df, on=["turn_id", "encounter_id"])


qa_df.to_csv("output/question_answer_pairs.csv", index=False)
qa_df

Unnamed: 0,encounter_id,turn_id,doctor_q,emote,finding,patient_a
0,0,0,"What brings you back into the clinic today, miss?",Neutral,none,I came in for a refill of my blood pressure me...
1,0,1,It looks like Doctor Kumar followed up with yo...,Neutral,none,No.
2,0,2,"Have you had any fever or chills, cough, conge...",Neutral,cough,No.
3,0,3,"Great. Also, for our records, how old are you ...",Affirmative,none,I am seventy six years old and identify as a w...
4,1,4,How're you feeling today?,Neutral,none,Terrible. I'm having the worst headache of my ...
...,...,...,...,...,...,...
5437,1199,5474,"Awesome. I moved here in O nine, so two years ...",Affirmative,none,No way! How funny. Where from?
5438,1199,5475,I'm originally from Kentucky. And I have to do...,Neutral,smoking,No to all of the above.
5439,1200,5476,Looks like the nurse came in and asked you eve...,Neutral,none,"Oh, it's a senior citizen house. They have man..."
5440,1200,5477,Do you smoke?,Neutral,smoking,No.


In [104]:
import pandas as pd
qa_df = pd.read_csv("output/question_answer_pairs.csv")


qa_df = qa_df.set_index("turn_id")
qa_df

Unnamed: 0_level_0,encounter_id,doctor_q,emote,finding,patient_a
turn_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,"What brings you back into the clinic today, miss?",Neutral,none,I came in for a refill of my blood pressure me...
1,0,It looks like Doctor Kumar followed up with yo...,Neutral,none,No.
2,0,"Have you had any fever or chills, cough, conge...",Neutral,cough,No.
3,0,"Great. Also, for our records, how old are you ...",Affirmative,none,I am seventy six years old and identify as a w...
4,1,How're you feeling today?,Neutral,none,Terrible. I'm having the worst headache of my ...
...,...,...,...,...,...
5474,1199,"Awesome. I moved here in O nine, so two years ...",Affirmative,none,No way! How funny. Where from?
5475,1199,I'm originally from Kentucky. And I have to do...,Neutral,smoking,No to all of the above.
5476,1200,Looks like the nurse came in and asked you eve...,Neutral,none,"Oh, it's a senior citizen house. They have man..."
5477,1200,Do you smoke?,Neutral,smoking,No.


In [112]:

# qa_df = qa_df.set_index("turn_id")
# qa_df = qa_df.head(10)
new_df = qa_df

for turn, row in qa_df.iterrows():
    prev_dr_turn = "none"
    prev_pt_resp = "none"   
    if turn > 0 :
        try:
            prev_turn = qa_df.loc[turn - 1]        
            if prev_turn["encounter_id"] == row["encounter_id"]:
                prev_dr_turn = prev_turn["doctor_q"] 
                prev_pt_resp = prev_turn["patient_a"]
        except:
            pass

    
    # print(prev_dr_turn, prev_pt_resp)


    new_df.loc[turn, "prev_doctor_q"] = prev_dr_turn
    new_df.loc[turn, "prev_patient_a"] = prev_pt_resp

df_json = new_df.to_json(orient='records', lines=True)

with open("output/question_answer_dataset.jsonl", "w") as f:
    f.write(df_json + '\n')

new_df.to_csv("output/question_answer_dataset.csv", index=False)

new_df

Unnamed: 0_level_0,encounter_id,doctor_q,emote,finding,patient_a,prev_doctor_q,prev_patient_a
turn_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,"What brings you back into the clinic today, miss?",Neutral,none,I came in for a refill of my blood pressure me...,none,none
1,0,It looks like Doctor Kumar followed up with yo...,Neutral,none,No.,"What brings you back into the clinic today, miss?",I came in for a refill of my blood pressure me...
2,0,"Have you had any fever or chills, cough, conge...",Neutral,cough,No.,It looks like Doctor Kumar followed up with yo...,No.
3,0,"Great. Also, for our records, how old are you ...",Affirmative,none,I am seventy six years old and identify as a w...,"Have you had any fever or chills, cough, conge...",No.
4,1,How're you feeling today?,Neutral,none,Terrible. I'm having the worst headache of my ...,none,none
...,...,...,...,...,...,...,...
5474,1199,"Awesome. I moved here in O nine, so two years ...",Affirmative,none,No way! How funny. Where from?,How long-,Moved up here in O seven.
5475,1199,I'm originally from Kentucky. And I have to do...,Neutral,smoking,No to all of the above.,"Awesome. I moved here in O nine, so two years ...",No way! How funny. Where from?
5476,1200,Looks like the nurse came in and asked you eve...,Neutral,none,"Oh, it's a senior citizen house. They have man...",none,none
5477,1200,Do you smoke?,Neutral,smoking,No.,Looks like the nurse came in and asked you eve...,"Oh, it's a senior citizen house. They have man..."


# Format Dialogue from QA Record

In [18]:
import pandas as pd
import re

In [None]:
# Rule-based emotion annotation (simplified version from MEDCOD)
def classify_emotion(text):
    text = text.lower()
    if any(x in text for x in ["sorry", "apologize", "apologies", "i regret"]):
        return "apology"
    elif any(x in text for x in ["that must be hard", "i understand", "that's unfortunate", "i'm sorry to hear", "sounds difficult", "that’s worrisome"]):
        return "empathy"
    elif any(x in text for x in ["thanks", "okay", "got it", "understood", "sure", "great", "noted", "i see", "good"]):
        return "affirmative"
    return "none"

# Heuristic: find prefix statements before questions
def extract_emote_phrase(text):
    # Split on question punctuation or transition
    parts = re.split(r'(?<=[.!?])\s+', text.strip())
    if len(parts) > 1:
        # Return first sentence if it looks like a preface
        if len(parts[0].split()) <= 12:
            return parts[0]
    return None

def yes_no_responses(text) :
    text = text.lower()
    if re.search("\b{x}\b", text, re.IGNORECASE) for x in ["yes", "yeah"]:
        return "yes"
    elif any(re.search("\b{x}\b", text, re.IGNORECASE) for x in ["no", "nope"]):
        return "no"
    return "none"

In [33]:
mts_df = pd.read_csv("MTS-Dialog-TrainingSet.csv")
next_encounter_id = 1

qa_df = pd.DataFrame({
    "encounter_id" : [],
    "doctor_q" : [],
    "patient_a" : [],
    "answer_type" : []
})

for _, row in mts_df.iterrows():
    encounter_id = next_encounter_id
    next_encounter_id += 1
    if "dialogue" not in row or not isinstance(row['dialogue'], str):
        continue
    dialogue = row['dialogue']
    turns = re.findall(r"(Doctor|Patient): ([^:]+?)(?=(?:Doctor|Patient):|$)", dialogue)

    if len(turns) < 2:
        continue

    for i in range(2, len(turns) - 1, 2):
        qa_df.loc[len(qa_df)] = {
            "encounter_id" : encounter_id,
            "doctor_q" : turns[i][1],
            "patient_a" : turns[i+1][1],
            "answer_type" : yes_no_responses(turns[i+1][1])
        }

# qa_df = qa_df[qa_df["answer_type"] != "none"]

qa_df.to_csv("output/question_answer_pairs.csv", index=False)

qa_df



Unnamed: 0,encounter_id,doctor_q,patient_a,answer_type
0,1,It looks like Doctor Kumar followed up with yo...,No. \r\n,none
1,1,"Have you had any fever or chills, cough, conge...",No. \r\n,none
2,1,"Great. Also, for our records, how old are you ...",I am seventy six years old and identify as a w...,none
3,2,"I'm so sorry. Well you are only twenty five, s...",Around eleven in the morning. \r\n,none
4,2,Today? \r\n,Um no yesterday. July thirty first. \r\n,none
...,...,...,...,...
3723,1200,How long- \r\n,Moved up here in O seven. \r\n,none
3724,1200,"Awesome. I moved here in O nine, so two years ...",No way! How funny. Where from?\r\n,none
3725,1200,I'm originally from Kentucky. And I have to do...,No to all of the above. \r\n,none
3726,1201,Do you smoke? \r\n,No. \r\n,none


In [19]:
import pandas as pd
qa_df = pd.read_csv("output/question_answer_pairs.csv")

qa_df

Unnamed: 0,encounter_id,doctor_q,patient_a,answer_type
0,1,It looks like Doctor Kumar followed up with yo...,No. \r\n,no
1,1,"Have you had any fever or chills, cough, conge...",No. \r\n,no
2,1,"Great. Also, for our records, how old are you ...",I am seventy six years old and identify as a w...,none
3,2,"I'm so sorry. Well you are only twenty five, s...",Around eleven in the morning. \r\n,none
4,2,Today? \r\n,Um no yesterday. July thirty first. \r\n,no
...,...,...,...,...
3723,1200,How long- \r\n,Moved up here in O seven. \r\n,none
3724,1200,"Awesome. I moved here in O nine, so two years ...",No way! How funny. Where from?\r\n,no
3725,1200,I'm originally from Kentucky. And I have to do...,No to all of the above. \r\n,no
3726,1201,Do you smoke? \r\n,No. \r\n,no


In [29]:
import json
import re

kb_qa_df = qa_df



with open("revised_input/symptom_pattern_map.json", "r") as f:
    kb_pattern_map = json.load(f)



def extract_concept(text: str) -> str:
    for concept, regex_list in kb_pattern_map.items():
        for pattern in regex_list:
            if re.search("\b{pattern}\b", text, re.IGNORECASE):
                return concept
    return "none"

# 4) Apply to each question
kb_qa_df["kb_concept"] = kb_qa_df["doctor_q"].apply(extract_concept)

kb_qa_df.to_csv("output/qa_with_kb_labels.csv", index=False)


In [28]:
kb_qa_df = pd.read_csv("output/qa_with_kb_labels.csv")

kb_qa_df[kb_qa_df["kb_concept"] != "none"].to_csv("output/qa_labled_subset.csv", index=False)

In [7]:

df = pd.DataFrame({
    "encounter_id" : [],
    "prev_dr_turn" : [], 
    "prev_pt_resp" : [], 
    "answer_type" : [],
    "curr_dr_turn" : [],
    "kb_concept" : [],
    "emote" : [],
    "emote_phrase" : []
})

prev_row = pd.Series()

for index, row in kb_qa_df.iterrows():
    if prev_row.any() and prev_row["encounter_id"] == row["encounter_id"]:
        prev_dr_turn = prev_row["doctor_q"] 
        prev_pt_resp = prev_row["patient_a"]
    else :
        prev_dr_turn = "none"
        prev_pt_resp = "none"       

    emote = classify_emotion(row["doctor_q"])
    phrase = extract_emote_phrase(row["doctor_q"])

    df.loc[len(df)] = {
        "encounter_id" : encounter_id,
        "prev_dr_turn" : prev_dr_turn, 
        "prev_pt_resp" : prev_pt_resp, 
        "answer_type" : row["answer_type"],
        "curr_dr_turn" : row["doctor_q"],
        "kb_concept" : row["kb_concept"],
        "emote" : emote,
        "emote_phrase" : phrase
    }   

    prev_row = row
        

df = df[df["kb_concept"] != "none"]

df.to_csv("output/formatted_dialogue.csv", index=False)

df





Unnamed: 0,encounter_id,prev_dr_turn,prev_pt_resp,answer_type,curr_dr_turn,kb_concept,emote,emote_phrase
1,1201,It looks like Doctor Kumar followed up with yo...,No. \r\n,no,"Have you had any fever or chills, cough, conge...",Fever/Chills,affirmative,
6,1201,July thirty first O eight. Got it. Did it come...,Yeah. \r\n,none,"Are you having any symptoms with it, such as b...",dizziness,none,
69,1201,Did you ever have a feeling where you felt lik...,No. \r\n,no,How about any fatigue or pain? Any frequency i...,fatigue,none,How about any fatigue or pain?
73,1201,none,none,yes,And you got right side shoulder pain? Is this ...,shoulder pain,none,And you got right side shoulder pain?
80,1201,How many episodes of vomiting have you had? \r\n,At least four. \r\n,none,"Any abdominal pain, fever, chill, or other sym...",Fever/Chills,none,
...,...,...,...,...,...,...,...,...
3672,1201,"Have you had a cough, or coughed up any blood....","That's a negative, doctor. \r\n",no,What about shortness of breath while at rest o...,shortness of breath,none,
3675,1201,none,none,no,No nausea and vomiting or black stool? Did you...,nausea and vomiting,none,No nausea and vomiting or black stool?
3678,1201,"Okay, any chest pain?\r\n","Um, no.\r\n",no,"Any shortness of breath, cough or cold like sy...",cough,none,
3686,1201,Let's see what we got here. Your reading says ...,"Yes, I'm exercising as a rule three times ever...",no,Do you feel any tingling or numbness or any ki...,numbness,none,


In [8]:
# Replace 'none' and NaN with empty strings to avoid inserting "none" in dialogue
df_cleaned = df.replace('none', '').fillna('')

# Group by encounter_id to build conversations
grouped = df_cleaned.groupby('encounter_id')

# EOS token for DialoGPT
eos_token = "<|endoftext|>"

# Build formatted dialogues
dialogues = []

for encounter_id, group in grouped:
    turns = []
    for _, row in group.iterrows():
        # Interleave turns
        if row['prev_dr_turn'].strip():
            turns.append(f"Doctor: {row['prev_dr_turn'].strip()}")
        if row['prev_pt_resp'].strip():
            turns.append(f"Patient: {row['prev_pt_resp'].strip()}")
        if row['curr_dr_turn'].strip():
            emote = f"[{row['emote']}]" if row['emote'] else ""
            turns.append(f"Doctor{emote}: {row['curr_dr_turn'].strip()}")

    # Join the turns with EOS separator
    full_dialogue = eos_token.join(turns) + eos_token
    dialogues.append({
        "encounter_id": encounter_id,
        "dialogue": full_dialogue
    })

# Convert to DataFrame
dialogue_df = pd.DataFrame(dialogues)


# Save the processed dialogue to a JSONL file suitable for HuggingFace fine-tuning
output_path = "output/dialogpt_finetune_dataset.jsonl"
dialogue_df.to_json(output_path, orient="records", lines=True)

# Preview the output
dialogue_df.head()


Unnamed: 0,encounter_id,dialogue
0,1201,Doctor: It looks like Doctor Kumar followed up...


# Generate Cases

In [None]:
from MedicalKnowledgeBase import MedicalKnowledgeBase

kb = MedicalKnowledgeBase('output/mimic_4_kb_w_freq.json')

# kb = MedicalKnowledgeBase()
# kb.load_kb()


KeyboardInterrupt: 

In [3]:
import json 

encounter_id = 0
cases_to_generate = 10
cases = []


for i in range(cases_to_generate):
    cases.append({
        "encounter_id" : encounter_id,
        "findings" : kb.get_random_findings(10)
    })
    encounter_id += 1

with open("output/simulated_cases.json", "w") as f:
    json.dump(cases, f, indent=2)

In [17]:
import json 
with open("output/simulated_cases.json") as f:
    cases = json.load(f)

# print(cases)

with open("revised_input/symptom_questions.json") as f:
    questions = json.load(f)

print(questions)

{'abdominal pain': ['Have you been experiencing any abdominal pain recently?', 'When did your abdominal pain start?', 'Are you experiencing abdominal pain?', 'Has your abdominal pain gotten better or worse recently?', 'Do you have any abdominal pain?'], 'anal pain': ['Do you have any anal pain?', 'Has your anal pain gotten better or worse recently?', 'When did your anal pain start?', 'Are you experiencing anal pain?', 'Have you been experiencing any anal pain recently?'], 'ankle pain': ['Have you been experiencing any ankle pain recently?', 'Do you have any ankle pain?', 'Are you experiencing ankle pain?', 'Has your ankle pain gotten better or worse recently?', 'When did your ankle pain start?'], 'arm pain': ['Have you been experiencing any arm pain recently?', 'When did your arm pain start?', 'Are you experiencing arm pain?', 'Do you have any arm pain?', 'Has your arm pain gotten better or worse recently?'], 'back pain': ['Are you experiencing back pain?', 'Do you have any back pain?'

In [None]:
import random

with open("revised_input/symptom_questions.json") as f:
    questions = json.load(f)

print(questions)

def simulate_conversation(kb, case_findings, max_turns=10):
    conversation = []
    current_findings = []
    neg_findings = []
    next_finding = kb.get_random_finding()

    for _ in range(max_turns):  
        # Ask the question
        question = generate_question(next_finding)
        
        # Simulate patient response
        answer = "Yes" if next_finding in case_findings else "No"

        # Save conversation turn
        conversation.append({
            "doctor_q": question,
            "patient_a": answer,
            "finding_queried": next_finding,
            "response_positive": (answer == "Yes")
        })

        # print({
        #     "doctor_q": question,
        #     "patient_a": answer,
        #     "finding_queried": next_finding,
        #     "response_positive": (answer == "Yes")
        # })

        # Update what findings have been asked
        if answer == "Yes":
            current_findings.append(next_finding)
        else:
            neg_findings.append(next_finding)

        # Suggest next finding
        next_finding = kb.suggest_next_finding(current_findings, neg_findings)


        if not next_finding:
            # print("No more findings to ask about.")
            break

    return conversation

def generate_question(finding):
    try: 
        return random.choice(questions[finding])
    except: 
        print("no questions for " + finding)



In [18]:
out = []

for case in cases:
    out.append({
        "encounter_id" : case["encounter_id"],
        "findings" : case["findings"],
        "turns" : simulate_conversation(kb, case["findings"])
    })

out

[{'encounter_id': 0,
  'findings': ['fever',
   'vaginal dryness',
   'nipple discharge',
   'night sweats',
   'unexplained weight loss',
   'leg cramps',
   'throat pain',
   'shoulder pain',
   'elbow pain',
   'vomiting'],
  'turns': [{'doctor_q': 'When did your headache start?',
    'patient_a': 'No',
    'finding_queried': 'headache',
    'response_positive': False},
   {'doctor_q': 'Do you have a fever?',
    'patient_a': 'Yes',
    'finding_queried': 'fever',
    'response_positive': True},
   {'doctor_q': 'When did your shortness of breath start?',
    'patient_a': 'No',
    'finding_queried': 'shortness of breath',
    'response_positive': False},
   {'doctor_q': 'Have you felt nauseous a lot recently?',
    'patient_a': 'No',
    'finding_queried': 'nausea',
    'response_positive': False},
   {'doctor_q': 'Are you experiencing cough?',
    'patient_a': 'No',
    'finding_queried': 'cough',
    'response_positive': False},
   {'doctor_q': 'Have you been experiencing insomnia

In [None]:
def chatbot_loop():
    print("Chatbot: Hello, I'm going to ask you some questions about your symptoms.")
    current_findings = []
    neg_findings = []
    next_finding = kb.get_random_finding()
    while len(current_findings) + len(neg_findings) < 10:

        question = generate_question(next_finding)

        print(f"Chatbot: {question}")
        user_input = input("You: ")
        if any(word in user_input.lower() for word in ["yes", "yeah", "y", "i have", "sure"]):
            current_findings.append(next_finding)
        else:
            neg_findings.append(next_finding)
        
        # Suggest next finding
        next_finding = kb.suggest_next_finding(current_findings, neg_findings)

    print("Chatbot: Thank you for your time.")
    

def generate_question(finding):
    try: 
        return random.choice(questions[finding])
    except: 
        print("no questions for " + finding)

