In [None]:
import pandas as pd
import re

In [None]:
# Rule-based emotion annotation (simplified version from MEDCOD)
def classify_emotion(text):
    text = text.lower()
    if any(x in text for x in ["sorry", "apologize", "apologies", "i regret"]):
        return "apology"
    elif any(x in text for x in ["that must be hard", "i understand", "that's unfortunate", "i'm sorry to hear", "sounds difficult", "that’s worrisome"]):
        return "empathy"
    elif any(x in text for x in ["thanks", "okay", "got it", "understood", "sure", "great", "noted", "i see", "good"]):
        return "affirmative"
    return "none"

# Heuristic: find prefix statements before questions
def extract_emote_phrase(text):
    # Split on question punctuation or transition
    parts = re.split(r'(?<=[.!?])\s+', text.strip())
    if len(parts) > 1:
        # Return first sentence if it looks like a preface
        if len(parts[0].split()) <= 12:
            return parts[0]
    return None

def yes_no_responses(text) :
    text = re.sub(r"[^A-Za-z ]+", '', text).lower()
    text_split = text.split()
    if any(x in text_split for x in ["yes", "yeah"]):
        return "yes"
    elif any(x in text_split for x in ["no", "nope"]):
        return "no"
    return "none"

In [104]:
mts_df = pd.read_csv("MTS-Dialog-TrainingSet.csv")
next_encounter_id = 1

qa_df = pd.DataFrame({
    "encounter_id" : [],
    "doctor_q" : [],
    "patient_a" : [],
    "answer_type" : []
})

for _, row in mts_df.iterrows():
    encounter_id = next_encounter_id
    next_encounter_id += 1
    if "dialogue" not in row or not isinstance(row['dialogue'], str):
        continue
    dialogue = row['dialogue']
    turns = re.findall(r"(Doctor|Patient): ([^:]+?)(?=(?:Doctor|Patient):|$)", dialogue)

    if len(turns) < 2:
        continue

    for i in range(2, len(turns) - 1, 2):
        qa_df.loc[len(qa_df)] = {
            "encounter_id" : encounter_id,
            "doctor_q" : turns[i][1],
            "patient_a" : turns[i+1][1],
            "answer_type" : yes_no_responses(turns[i+1][1])
        }

# qa_df = qa_df[qa_df["answer_type"] != "none"]

qa_df.to_csv("output/question_answer_pairs.csv", index=False)



In [105]:
qa_df = pd.read_csv("output/question_answer_pairs.csv")

qa_df

Unnamed: 0,encounter_id,doctor_q,patient_a,answer_type
0,1,It looks like Doctor Kumar followed up with yo...,No. \r\n,no
1,1,"Have you had any fever or chills, cough, conge...",No. \r\n,no
2,1,"Great. Also, for our records, how old are you ...",I am seventy six years old and identify as a w...,none
3,2,"I'm so sorry. Well you are only twenty five, s...",Around eleven in the morning. \r\n,none
4,2,Today? \r\n,Um no yesterday. July thirty first. \r\n,no
...,...,...,...,...
3723,1200,How long- \r\n,Moved up here in O seven. \r\n,none
3724,1200,"Awesome. I moved here in O nine, so two years ...",No way! How funny. Where from?\r\n,no
3725,1200,I'm originally from Kentucky. And I have to do...,No to all of the above. \r\n,no
3726,1201,Do you smoke? \r\n,No. \r\n,no


In [106]:
kb_qa_df = qa_df

kb_concept_file = "mayoclinic_symptom_list.txt"
with open(kb_concept_file, "r") as f:
    kb_concept_list = [line.strip().lower() for line in f if line.strip()]

pattern_map = {
    "Fever/Chills": [
        r"\bfever\b",
        r"\bchills\b"
    ]
}
for concept in kb_concept_list:
    pattern_map[concept] = [r'\b(' + concept + r')\b']


def extract_concept(text: str) -> str:
    for concept, regex_list in pattern_map.items():
        for pattern in regex_list:
            if re.search(pattern, text, re.IGNORECASE):
                return concept
    return "none"

# 4) Apply to each question
kb_qa_df["kb_concept"] = kb_qa_df["doctor_q"].apply(extract_concept)

kb_qa_df = kb_qa_df[kb_qa_df["kb_concept"] != "none"]

kb_qa_df.to_csv("output/qa_with_kb_labels.csv", index=False)


In [107]:
kb_qa_df = pd.read_csv("output/qa_with_kb_labels.csv")

kb_qa_df

Unnamed: 0,encounter_id,doctor_q,patient_a,answer_type,kb_concept
0,1,"Have you had any fever or chills, cough, conge...",No. \r\n,no,Fever/Chills
1,2,"Are you having any symptoms with it, such as b...",I'm having blurry vision and lightheadedness. ...,none,dizziness
2,15,How about any fatigue or pain? Any frequency i...,No.,no,fatigue
3,17,And you got right side shoulder pain? Is this ...,Yes. That is right.\r\n,yes,shoulder pain
4,18,"Any abdominal pain, fever, chill, or other sym...",Just nausea and vomiting. It's been so terribl...,none,Fever/Chills
...,...,...,...,...,...
196,1185,What about shortness of breath while at rest o...,"No, that's normal too. I'm breathing just fine.",no,shortness of breath
197,1187,No nausea and vomiting or black stool? Did you...,"Nope, nothing like that.\r\n",no,nausea and vomiting
198,1187,"Any shortness of breath, cough or cold like sy...",Nope.\r\n,no,cough
199,1188,Do you feel any tingling or numbness or any ki...,Nope. \r\n,no,numbness


In [108]:

df = pd.DataFrame({
    "encounter_id" : [],
    "prev_dr_turn" : [], 
    "prev_pt_resp" : [], 
    "answer_type" : [],
    "curr_dr_turn" : [],
    "kb_concept" : [],
    "emote" : [],
    "emote_phrase" : []
})

prev_row = pd.Series()

for index, row in kb_qa_df.iterrows():
    if prev_row.any() and prev_row["encounter_id"] == row["encounter_id"]:
        prev_dr_turn = prev_row["doctor_q"] 
        prev_pt_resp = prev_row["patient_a"]
    else :
        prev_dr_turn = "none"
        prev_pt_resp = "none"       

    emote = classify_emotion(row["doctor_q"])
    phrase = extract_emote_phrase(row["doctor_q"])

    df.loc[len(df)] = {
        "encounter_id" : encounter_id,
        "prev_dr_turn" : prev_dr_turn, 
        "prev_pt_resp" : prev_pt_resp, 
        "answer_type" : row["answer_type"],
        "curr_dr_turn" : row["doctor_q"],
        "kb_concept" : row["kb_concept"],
        "emote" : emote,
        "emote_phrase" : phrase
    }   

    prev_row = row
        

df = df[df["kb_concept"] != "none"]

df.to_csv("output/formatted_dialogue.csv", index=False)

df





Unnamed: 0,encounter_id,prev_dr_turn,prev_pt_resp,answer_type,curr_dr_turn,kb_concept,emote,emote_phrase
0,1201,none,none,no,"Have you had any fever or chills, cough, conge...",Fever/Chills,affirmative,
1,1201,none,none,none,"Are you having any symptoms with it, such as b...",dizziness,none,
2,1201,none,none,no,How about any fatigue or pain? Any frequency i...,fatigue,none,How about any fatigue or pain?
3,1201,none,none,yes,And you got right side shoulder pain? Is this ...,shoulder pain,none,And you got right side shoulder pain?
4,1201,none,none,none,"Any abdominal pain, fever, chill, or other sym...",Fever/Chills,none,
...,...,...,...,...,...,...,...,...
196,1201,"Have you had a cough, or coughed up any blood....","That's a negative, doctor. \r\n",no,What about shortness of breath while at rest o...,shortness of breath,none,
197,1201,none,none,no,No nausea and vomiting or black stool? Did you...,nausea and vomiting,none,No nausea and vomiting or black stool?
198,1201,No nausea and vomiting or black stool? Did you...,"Nope, nothing like that.\r\n",no,"Any shortness of breath, cough or cold like sy...",cough,none,
199,1201,none,none,no,Do you feel any tingling or numbness or any ki...,numbness,none,


In [110]:
# Replace 'none' and NaN with empty strings to avoid inserting "none" in dialogue
df_cleaned = df.replace('none', '').fillna('')

# Group by encounter_id to build conversations
grouped = df_cleaned.groupby('encounter_id')

# EOS token for DialoGPT
eos_token = "<|endoftext|>"

# Build formatted dialogues
dialogues = []

for encounter_id, group in grouped:
    turns = []
    for _, row in group.iterrows():
        # Interleave turns
        if row['prev_dr_turn'].strip():
            turns.append(f"Doctor: {row['prev_dr_turn'].strip()}")
        if row['prev_pt_resp'].strip():
            turns.append(f"Patient: {row['prev_pt_resp'].strip()}")
        if row['curr_dr_turn'].strip():
            emote = f"[{row['emote']}]" if row['emote'] else ""
            turns.append(f"Doctor{emote}: {row['curr_dr_turn'].strip()}")

    # Join the turns with EOS separator
    full_dialogue = eos_token.join(turns) + eos_token
    dialogues.append({
        "encounter_id": encounter_id,
        "dialogue": full_dialogue
    })

# Convert to DataFrame
dialogue_df = pd.DataFrame(dialogues)


# Save the processed dialogue to a JSONL file suitable for HuggingFace fine-tuning
output_path = "output/dialogpt_finetune_dataset.jsonl"
dialogue_df.to_json(output_path, orient="records", lines=True)

# Preview the output
dialogue_df.head()


Unnamed: 0,encounter_id,dialogue
0,1201,Doctor[affirmative]: Have you had any fever or...
