In [1]:
import json
import re

# Function to load JSON from a file and extract speaker-text pairs
def extract_speaker_text(json_path, doc_name, pat_name):
    """Load a conversation JSON file and return cleaned ``[speaker, text]`` pairs.

    Only entries that contain both a "speaker" and a "text" key are used.
    Processing steps, in order:

    1. Entries whose speaker is "Measurement" are skipped.
    2. The speaker labels "Doctor" and "Patient" are replaced by ``doc_name``
       and ``pat_name``; any other label is kept as-is.
    3. "REQUEST TEST:" markers are stripped from the text.
    4. If a text contains a blank line followed by "Doctor:" or "Patient:",
       it is cut at that point. The part before the cut stays with the current
       speaker; the part after the label is carried over and prepended to the
       text of the next processed entry.
    5. Of consecutive pairs with the same speaker, only the last one is kept.
    6. In the final pair, a "DIAGNOSIS READY:" section (through the blank line
       that follows it) is removed.

    Args:
        json_path: Path of the JSON file to read (opened as UTF-8).
        doc_name: Name that replaces the "Doctor" speaker label.
        pat_name: Name that replaces the "Patient" speaker label.

    Returns:
        A list of ``[speaker, text]`` lists. An empty list is returned when the
        file is missing, is not valid JSON, or yields no usable entries.
    """
    # Only the file load can raise the two handled exceptions, so keep the
    # try block limited to it.
    try:
        with open(json_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"Error: The file '{json_path}' was not found.")
        return []
    except json.JSONDecodeError:
        print(f"Error: Failed to decode JSON from '{json_path}'.")
        return []

    corrected_conversations = []
    # Text cut off from a previous entry, waiting to be prepended to the next one.
    # Whatever is still buffered after the last entry is discarded.
    buffer_text = ""

    for entry in data:
        if not ("speaker" in entry and "text" in entry):
            continue

        speaker = entry["speaker"]

        # Skip entries where speaker is "Measurement"
        if speaker == "Measurement":
            continue

        if speaker == "Doctor":
            speaker = doc_name
        elif speaker == "Patient":
            speaker = pat_name

        text = buffer_text + entry["text"]
        buffer_text = ""

        text = re.sub(r"\bREQUEST TEST:\s*", "", text)

        # Detect an unintended speaker label embedded in the text.
        match = re.search(r"\n\n(Doctor|Patient):", text)
        if match is None:
            corrected_conversations.append([speaker, text])
            continue

        split_index = match.start()
        current_text = text[:split_index].strip()
        next_text = text[split_index:].strip()

        # Remove the "Doctor: " or "Patient: " prefix from the carried-over text.
        next_text = re.sub(r"^(Doctor|Patient):\s*", "", next_text).strip()

        corrected_conversations.append([speaker, current_text])

        # Carry the remainder over to the next entry, separated by a space.
        buffer_text = next_text + " "

    # Collapse runs of consecutive pairs from the same speaker, keeping the last.
    deduplicated_conversations = []
    for pair in corrected_conversations:
        if deduplicated_conversations and deduplicated_conversations[-1][0] == pair[0]:
            deduplicated_conversations[-1] = pair
        else:
            deduplicated_conversations.append(pair)

    # Nothing usable in the file (e.g. empty list or only "Measurement"
    # entries): return early instead of indexing into an empty list below.
    if not deduplicated_conversations:
        return []

    # Process last element to remove "DIAGNOSIS READY:" section
    last_speaker, last_text = deduplicated_conversations[-1]
    diagnosis_pattern = re.search(r"DIAGNOSIS READY:.*?\n\n", last_text, re.DOTALL)
    if diagnosis_pattern:
        last_text = last_text.replace(diagnosis_pattern.group(), "").strip()
        deduplicated_conversations[-1] = [last_speaker, last_text]

    return deduplicated_conversations

In [8]:
import glob
import os

# Directory holding the conversation JSON files, and the index of the file to inspect
idx = 0
json_paths = "D:/Desktop/Medical_Project/TOQ/output/"
# glob returns matches in arbitrary order; sort so `idx` selects the same file on every run
json_files = sorted(glob.glob(os.path.join(json_paths, "*.json")))

# Extract and print the results
if -len(json_files) <= idx < len(json_files):
    result = extract_speaker_text(json_files[idx], 'Maria Lopez', 'Klaus Mueller')
else:
    # Missing directory, no JSON files, or idx out of range: report it instead of
    # failing with a bare IndexError, and fall back to an empty conversation.
    print(f"Error: No JSON file at index {idx}; found {len(json_files)} file(s) in '{json_paths}'.")
    result = []
print(result)
for res in result:
    print(res)
    print()

[['Maria Lopez', 'Can you tell me more about the onset of your gait and limb ataxia? Did it start suddenly or gradually develop over time?'], ['Klaus Mueller', 'It started with me just feeling really unsteady when I walk, like I was going to stumble or trip at any moment. At first, I thought it was just my balance, but then I noticed my friends and family saying I looked like I was drunk, even when I was sober.'], ['Maria Lopez', 'When did you first notice the symptoms, and have you experienced any other neurological issues, such as numbness, weakness, or vision changes?'], ['Klaus Mueller', "It started about 2 months ago, to be exact. I remember it was a normal day, and then suddenly I felt this weird unsteadiness when I stood up from a chair or walked down the stairs. It's been getting worse, and now I feel like I'm constantly fighting to stay upright."], ['Maria Lopez', "Can you walk for me, please, and show me how you normally move? I'd like to see if I can identify any patterns or