# Load Dependencies

In [None]:
import pandas as pd
import os
import re
import time
from math import ceil
import json
import numpy as np

In [None]:
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
import openai
from concurrent.futures import ThreadPoolExecutor, as_completed


In [None]:
# Azure OpenAI connection settings for this notebook.
# NOTE(review): api_type is assigned but not passed to the client below.
api_type = "azure"
api_base = "https://mgh-mind-data-science-private-e2-openai-service.openai.azure.com/"
api_version = "2024-05-13-preview"

# Authenticate with Azure AD: a bearer-token provider scoped to Cognitive
# Services is handed to the client instead of an API key.
default_credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(
    default_credential,
    "https://cognitiveservices.azure.com/.default",
)

# Shared client used by every summarisation / diagnosis function in this file.
client = openai.AzureOpenAI(
    azure_endpoint=api_base,
    api_version=api_version,
    azure_ad_token_provider=token_provider,
)

# Load and process data

In [None]:
# Load the patient notes and the per-patient diagnosis table from CSV.
# Both frames are joined on 'PatientID' in the next cell.
df_note = pd.read_csv('../../EDW Queries/Moura et al data-all-notes/Formatted_3year_moura_patient_notes_all.csv')
df_dx = pd.read_csv('../../EDW Queries/Moura et al data-all-notes/patient_dx.csv')

In [None]:
# Keep only notes whose patient also appears in the diagnosis table.
df_note = pd.merge(df_note, df_dx, on='PatientID', how='inner')

# Collapse each patient's notes into one space-separated string.
# NOTE(review): notes are joined in row order; presumably the CSV is already
# sorted chronologically - confirm. A missing (NaN) NoteTXT would make the
# join fail.
df_note_grouped = (
    df_note
    .groupby('PatientID')['NoteTXT']
    .apply(' '.join)
    .reset_index()
)
df_note_grouped

In [None]:
def chunk_notes(notes, word_limit=32000):
    """
    Splits patient notes into chunks of at most ``word_limit`` words.

    Notes are packed into chunks in their original order. A note is only
    split when it is longer than ``word_limit`` on its own: it is then broken
    at sentence boundaries, and a single sentence that is still too long is
    cut into fixed-size runs of words. Pieces of an oversized note are never
    mixed with neighbouring notes.

    Parameters:
    notes (list of str or str): Patient notes. A bare string is treated as a
        single note (not as an iterable of characters).
    word_limit (int): Maximum number of whitespace-separated words per chunk.

    Returns:
    list of str: Non-empty text chunks with whitespace normalised to single
        spaces.

    Raises:
    ValueError: If ``word_limit`` is smaller than 1.
    """
    if word_limit < 1:
        raise ValueError("word_limit must be a positive integer")

    # Iterating a str yields characters, which would turn every character
    # into its own "note"; treat a bare string as one note instead.
    if isinstance(notes, str):
        notes = [notes]

    chunks = []
    current_chunk = []
    current_word_count = 0

    def flush():
        """Emit the pending chunk (if any) and start a fresh one."""
        nonlocal current_chunk, current_word_count
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        current_chunk = []
        current_word_count = 0

    def add(words):
        """Append a piece, starting a new chunk first if it would overflow."""
        nonlocal current_word_count
        if current_chunk and current_word_count + len(words) > word_limit:
            flush()
        current_chunk.append(" ".join(words))
        current_word_count += len(words)

    for note in notes:
        words = note.split()
        if not words:
            # Empty / whitespace-only notes contribute nothing.
            continue

        if len(words) <= word_limit:
            add(words)
            continue

        # Oversized note: flush what came before so chunk order follows note
        # order, then split the note at sentence level.
        flush()
        for sentence in re.split(r'(?<=[.!?]) +', " ".join(words)):
            sentence_words = sentence.split()
            # A single sentence may itself exceed the limit; hard-split it.
            for start in range(0, len(sentence_words), word_limit):
                add(sentence_words[start:start + word_limit])
        flush()

    # Add the last chunk if it exists
    flush()

    return chunks


In [None]:
# Chunk each patient's joined note text (default word limit of chunk_notes)
# and persist the grouped table.
# NOTE(review): 'NoteTXT' holds one joined string per patient, so chunk_notes
# receives a str here rather than a list of notes - confirm it handles that.
# NOTE(review): 'NoteChunk' holds Python lists; to_csv writes them as their
# text representation, so they come back as plain strings when re-read.
df_note_grouped['NoteChunk'] = df_note_grouped['NoteTXT'].apply(chunk_notes)
df_note_grouped.to_csv("../../EDW Queries/Moura et al data-all-notes/grouped_patient_notes_dx.csv", index=False)

In [None]:
import ast

# Reload the grouped table. 'NoteChunk' was written to CSV as the text
# representation of a Python list, so parse it back into a real list;
# otherwise iterating a cell later would walk the string character by
# character instead of chunk by chunk.
df_note_grouped = pd.read_csv(
    "../../EDW Queries/Moura et al data-all-notes/grouped_patient_notes_dx.csv",
    converters={"NoteChunk": ast.literal_eval},
)

# Summary of Summaries

## Summaries step

In [None]:
def summary_note_chunk(chunk_text):
    """
    Summarise one chunk of patient notes with the "GPT-4o-model" deployment.

    The chunk is embedded in a neurologist-style prompt and sent through the
    module-level ``client`` with temperature 0.

    Parameters:
    chunk_text (str): Text of one note chunk.

    Returns:
    str: Content of the first completion choice.

    The call is retried every 60 seconds, without limit, whenever it raises
    an ``Exception``. KeyboardInterrupt / SystemExit are deliberately not
    caught so the loop can be interrupted.
    """
    while True:
        try:
            response = client.chat.completions.create(
                model="GPT-4o-model",
                messages=[
                    {"role": "system", 
                        "content": "You are a neurologist tasked with summarizing patient notes to assess cognitive impairment."},
                    {"role": "user", 
                        "content": f"""Here are the patient notes:
                        {chunk_text}
                    Please provide a concise summary focusing on the patient's cognitive health and any significant changes or diagnoses made during this period. """}
                ],
                temperature=0
            )
            return response.choices[0].message.content
        except Exception as e:
            # Show the real cause: not every failure is a rate limit.
            print(f"An error occurred: {e}")
            print("Rate limit exceeded. Retrying in 60 seconds...")
            time.sleep(60)

## Summarize summaries step

### Attempt 1

In [None]:
def summary_of_summary(summary):
    """
    Attempt 1: classify a patient as Normal / MCI / Dementia with a rationale.

    Parameters:
    summary (str): Chunk summaries for one patient, interpolated into the prompt.

    Returns:
    str: Content of the first completion choice from "gpt-4o-model2".

    Any exception is printed and the request is retried after 60 seconds,
    without limit (the message mentions rate limiting regardless of cause).
    """
    system_prompt = "You are a neurologist tasked with synthesizing summaries of patient notes to assess cognitive impairment over a three-year period."

    while True:
        try:
            user_prompt = f"""Here are the summaries in chronological order:
                    {summary}
                    Based on these summaries, provide an overall classification of the patient's cognitive health over the three-year period into one of the following syndromic diagnoses:
                    1. Normal
                    2. MCI 
                    3. Dementia
                    Provide a rationale for your classification.
                    Follow this format and do not include patient name in response: 
                    **syndromic diagnosis:**[Insert one of 3 categories above here]**
                    **summarized reasons:**[Insert the summary of reasoning here]
                    """
            completion = client.chat.completions.create(
                model="gpt-4o-model2",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0,
                max_tokens=4096,
            )
            return completion.choices[0].message.content
        except Exception as err:
            print(f"An error occurred: {err}")
            print("Rate limit exceeded. Retrying in 60 seconds...")
            time.sleep(60)

In [None]:
def extract_syndromic_dx(response):
    """
    Pull the syndromic diagnosis label out of a model response.

    Looks for ``**syndromic diagnosis:**`` (case-insensitive) followed by
    Normal, MCI or Dementia. Whitespace after the label is optional, and an
    opening bracket or a leading list number (e.g. ``[2. MCI]``) is tolerated,
    because the prompt template places the value directly after ``:**``.

    Parameters:
    response (str or None): Raw model output.

    Returns:
    str or None: 'normal', 'mci' or 'dementia', or None when no label is
        found or ``response`` is not a string.
    """
    if not isinstance(response, str):
        return None

    # Regex for the diagnosis label and its value
    pattern = r'\*\*\s*syndromic diagnosis:\s*\*\*\s*\[?\s*(?:\d\.\s*)?(Normal|MCI|Dementia)'
    match = re.search(pattern, response, re.IGNORECASE)
    if match:
        return match.group(1).lower()  # Convert to lowercase for consistency
    return None

In [None]:
def convert_dx_to_category(value):
    """
    Map a numeric diagnosis code to a category label.

    0 -> 'normal', 2 or 3 -> 'mci', 4 -> 'dementia'. Any other value
    (including 1) yields None.
    """
    if value == 0:
        return 'normal'
    if value in (2, 3):
        return 'mci'
    if value == 4:
        return 'dementia'
    return None

### Attempt 2 - Confidence level

In [None]:
def summary_of_summary(summary):
    """
    Attempt 2: 3-class diagnosis plus a self-reported confidence score.

    Parameters:
    summary (str): Chunk summaries for one patient, interpolated into the prompt.

    Returns:
    str: Content of the first completion choice from "gpt-4o-model2".

    Any exception is printed and the request is retried after 60 seconds,
    without limit (the message mentions rate limiting regardless of cause).
    """
    system_prompt = "You are a neurologist tasked with synthesizing summaries of patient notes to assess cognitive impairment over a three-year period."

    while True:
        try:
            # NOTE(review): the prompt says "1 to 100" but also describes 0.
            user_prompt = f"""Here are the summaries in chronological order:
                    {summary}
                    Based on these summaries, provide an overall classification of the patient's cognitive health over the three-year period into one of the following syndromic diagnoses:
                    1. Normal
                    2. MCI 
                    3. Dementia
                    Provide a rationale for your classification. Additionally, provide a confidence score for the classification on a scale from 1 to 100, where 0 means completely uncertain and 100 means absolutely certain. Be very conservative with the scores.
                    Follow this format and do not include patient name in response: 
                    **syndromic diagnosis:**[Insert one of 3 categories above here]**
                    **confidence level:**[Insert confidence score 1-100 here]**
                    **summarized reasons:**[Insert the summary of reasoning here]
                    """
            completion = client.chat.completions.create(
                model="gpt-4o-model2",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0,
                max_tokens=4096,
            )
            return completion.choices[0].message.content
        except Exception as err:
            print(f"An error occurred: {err}")
            print("Rate limit exceeded. Retrying in 60 seconds...")
            time.sleep(60)

In [None]:
def extract_confidence_level(response):
    """
    Extract the confidence score reported in a model response.

    The prompts in this file ask for ``**confidence level:**[score]**`` with a
    score from 1 to 100, so that label is matched first (case-insensitive,
    optional whitespace / opening bracket). The legacy ``**CDR Score:**``
    pattern this function originally looked for is kept as a fallback so
    older responses still parse.

    Parameters:
    response (str or None): Raw model output.

    Returns:
    str or None: The score as text (e.g. "85" or "0.5"), or None if absent.
    """
    if not isinstance(response, str):
        return None

    match = re.search(
        r"\*\*\s*confidence level:\s*\*\*\s*\[?\s*(\d{1,3}(?:\.\d+)?)",
        response,
        re.IGNORECASE,
    )
    if match is None:
        # Backward-compatible fallback for the original label.
        match = re.search(r"\*\*CDR Score:\*\*\s*(\d(\.\d)?)", response)
    if match:
        return match.group(1)
    return None

### Attempt 3 - separate summary and dx

In [None]:
def summary_of_summaries(summaries):
    """
    Attempt 3, step 1: condense a patient's chunk summaries into one summary.

    Parameters:
    summaries (str): Chunk summaries for one patient, interpolated into the prompt.

    Returns:
    str: Content of the first completion choice from "gpt-4o-model2".

    Any exception is printed and the request is retried after 60 seconds,
    without limit (the message mentions rate limiting regardless of cause).
    """
    system_prompt = "You are a neurologist tasked with synthesizing summaries of patient notes to assess cognitive impairment over a three-year period."

    while True:
        try:
            user_prompt = f"""Here are the summaries in chronological order:
                    {summaries}
                    Provide a summary of these summaries of the patient's cognitive health over the three-year period.
                    """
            completion = client.chat.completions.create(
                model="gpt-4o-model2",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0,
                max_tokens=4096,
            )
            return completion.choices[0].message.content
        except Exception as err:
            print(f"An error occurred: {err}")
            print("Rate limit exceeded. Retrying in 60 seconds...")
            time.sleep(60)

In [None]:
def dx_of_summary(summary):
    """
    Attempt 3, step 2: diagnose (Normal / MCI / Dementia) from a single summary.

    The prompt also requests a confidence score and a short justification.

    Parameters:
    summary (str): One overall patient summary, interpolated into the prompt.

    Returns:
    str: Content of the first completion choice from "gpt-4o-model2".

    Any exception is printed and the request is retried after 60 seconds,
    without limit (the message mentions rate limiting regardless of cause).
    """
    system_prompt = "You are a neurologist tasked with making syndromic diagnoses of cognitive impairmen using summaries of patient notes over a three-year period."

    while True:
        try:
            user_prompt = f"""Here is the summary:
                    {summary}
                    Based on the summary, provide an overall classification of the patient's cognitive health over the three-year period into one of the following syndromic diagnoses:
                    1. Normal
                    2. MCI 
                    3. Dementia
                    Provide a rationale for your classification. Additionally, provide a confidence score for the classification on a scale from 1 to 100, where 0 means completely uncertain and 100 means absolutely certain. Be very conservative with the scores.
                    Follow this format and do not include patient name in response: 
                    **syndromic diagnosis:**[Insert one of 3 categories above here]**
                    **confidence level:**[Insert confidence score 1-100 here]**
                    **justification:** [Insert a few sentences summarizing key observations and reasoning here]**
                    """
            completion = client.chat.completions.create(
                model="gpt-4o-model2",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0,
                max_tokens=4096,
            )
            return completion.choices[0].message.content
        except Exception as err:
            print(f"An error occurred: {err}")
            print("Rate limit exceeded. Retrying in 60 seconds...")
            time.sleep(60)

### Attempt 4 - all 5 classes

In [None]:
def summary_of_summary(summary):
    """
    Attempt 4: classify a patient into one of five syndromic categories.

    Categories offered by the prompt: Normal, Normal vs MCI, MCI,
    MCI vs Dementia, Dementia; a justification is requested as well.

    Parameters:
    summary (str): Chunk summaries for one patient, interpolated into the prompt.

    Returns:
    str: Content of the first completion choice from "gpt-4o-model2".

    Any exception is printed and the request is retried after 60 seconds,
    without limit (the message mentions rate limiting regardless of cause).
    """
    system_prompt = "You are a neurologist tasked with synthesizing summaries of patient notes to assess cognitive impairment over a three-year period."

    while True:
        try:
            user_prompt = f"""Here are the summaries in chronological order:
                    {summary}
                    Based on these summaries, provide an overall classification of the patient's cognitive health over the three-year period into one of the following syndromic diagnoses:
                    1. Normal
                    2. Normal vs MCI (Mild Cognitive Impairment)
                    3. MCI
                    4. MCI vs Dementia
                    5. Dementia
                    Provide a rationale for your classification.
                    Follow this format and do not include patient name in response: 
                    **Syndromic Diagnosis:**[Insert one of 5 categories above here]**
                    **Justifications:**[Insert the summary of reasoning here]
                    """
            completion = client.chat.completions.create(
                model="gpt-4o-model2",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0,
                max_tokens=4096,
            )
            return completion.choices[0].message.content
        except Exception as err:
            print(f"An error occurred: {err}")
            print("Rate limit exceeded. Retrying in 60 seconds...")
            time.sleep(60)

### Attempt 5 - Logprob Revision 

In [None]:
def summary_of_summary(summary):
    """
    Attempt 5: 3-class diagnosis as a single token, with token log-probabilities.

    The model is limited to one output token and asked for the category
    number only; the two most likely tokens are requested via ``top_logprobs``.

    Parameters:
    summary (str): Chunk summaries for one patient, interpolated into the prompt.

    Returns:
    tuple: ``(predicted_class, formatted_logprobs)`` where ``predicted_class``
        is the stripped response text and ``formatted_logprobs`` is a list of
        dicts with keys "token", "logprob" and "probability_percent"
        (exp(logprob) * 100, rounded to 2 decimals).

    Any exception is printed and the request is retried after 60 seconds,
    without limit (the message mentions rate limiting regardless of cause).
    """
    system_prompt = "You are a neurologist tasked with making syndromic diagnoses of cognitive impairmen using summaries of patient notes over a three-year period."

    while True:
        try:
            user_prompt = f"""Here are the summaries in chronological order:
                    {summary}
                    Based on these summaries, provide an overall classification of the patient's cognitive health over the three-year period into one of the following syndromic diagnoses:
                    1. Normal
                    2. MCI 
                    3. Dementia
                    Provide the response with just one number of one of the three categories above here, without any additional text or formatting.
                    """
            completion = client.chat.completions.create(
                model="gpt-4o-model2",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0,
                logprobs=True,
                top_logprobs=2,
                max_tokens=1,
            )

            # The classification itself (e.g. "1", "2", or "3")
            predicted_class = completion.choices[0].message.content.strip()

            # Alternatives considered for the first (and only) output token
            candidates = completion.choices[0].logprobs.content[0].top_logprobs

            formatted_logprobs = []
            for candidate in candidates:
                formatted_logprobs.append({
                    "token": candidate.token,
                    "logprob": candidate.logprob,
                    "probability_percent": np.round(np.exp(candidate.logprob)*100,2),
                })

            return predicted_class, formatted_logprobs
        except Exception as err:
            print(f"An error occurred: {err}")
            print("Rate limit exceeded. Retrying in 60 seconds...")
            time.sleep(60)

In [None]:
# Try the current summary_of_summary on one patient's saved summaries file
# and print whatever it returns.
summary_txt = "../GPT/Results - sydronmic dx/Summaries of Notes/Z6354035_summaries.txt"
with open(summary_txt, 'r') as file:
    patient_summary = file.read()
output = summary_of_summary(patient_summary)
print(output)


### Attempt 6 - 5 Class Revision

In [None]:
def summary_of_summary(summary):
    """
    Attempt 6: 5-class diagnosis with confidence score and justification.

    No ``max_tokens`` cap is passed in this variant.

    Parameters:
    summary (str): Chunk summaries for one patient, interpolated into the prompt.

    Returns:
    str: Content of the first completion choice from "gpt-4o-model2".

    Any exception is printed and the request is retried after 60 seconds,
    without limit (the message mentions rate limiting regardless of cause).
    """
    system_prompt = "You are a neurologist tasked with making syndromic diagnoses of cognitive impairmen using summaries of patient notes over a three-year period."

    while True:
        try:
            user_prompt = f"""Here are the summaries in chronological order:
                    {summary}
                    Based on these summaries, provide an overall classification of the patient's cognitive health over the three-year period into one of the following syndromic diagnoses:
                    1. Normal
                    2. Normal vs MCI
                    3. MCI
                    4. MCI vs Dementia
                    5. Dementia
                    Provide a rationale for your classification. Additionally, provide a confidence score for the classification on a scale from 1 to 100, where 0 means completely uncertain and 100 means absolutely certain. Be very conservative with the scores.
                    Follow this format and do not include patient name in response: 
                    **syndromic diagnosis:**[Insert one of 5 categories above here]**
                    **confidence level:**[Insert confidence score 1-100 here]**
                    **justification:** [Insert a few sentences summarizing key observations and reasoning here]**
                    """
            completion = client.chat.completions.create(
                model="gpt-4o-model2",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0,
            )
            return completion.choices[0].message.content
        except Exception as err:
            print(f"An error occurred: {err}")
            print("Rate limit exceeded. Retrying in 60 seconds...")
            time.sleep(60)

In [None]:
# Report every patient that has no saved "<PatientID>_summaries.txt" file yet;
# patients whose file exists are silently skipped.
input_dir = "../GPT/Results - sydronmic dx/Summaries of Notes"
for patient_id in df_note_grouped['PatientID'].unique():
    filename = os.path.join(input_dir, f'{patient_id}_summaries.txt')
    if not os.path.exists(filename):
        print(f"File {filename} does not exist.")

### Attempt 7 - 5 Class Logprobs

In [None]:
def summary_of_summary(summary):
    """
    Attempt 7: 5-class diagnosis as a single token, with token log-probabilities.

    The model is limited to one output token and asked for the category
    number only; the two most likely tokens are requested via ``top_logprobs``.

    Parameters:
    summary (str): Chunk summaries for one patient, interpolated into the prompt.

    Returns:
    tuple: ``(predicted_class, formatted_logprobs)`` where ``predicted_class``
        is the stripped response text and ``formatted_logprobs`` is a list of
        dicts with keys "token", "logprob" and "probability_percent"
        (exp(logprob) * 100, rounded to 2 decimals).

    Any exception is printed and the request is retried after 60 seconds,
    without limit (the message mentions rate limiting regardless of cause).
    """
    system_prompt = "You are a neurologist tasked with making syndromic diagnoses of cognitive impairmen using summaries of patient notes over a three-year period."

    while True:
        try:
            user_prompt = f"""Here are the summaries in chronological order:
                    {summary}
                    Based on these summaries, provide an overall classification of the patient's cognitive health over the three-year period into one of the following syndromic diagnoses:
                    1. Normal
                    2. Normal vs MCI
                    3. MCI
                    4. MCI vs Dementia
                    5. Dementia
                    Provide the response with just one number of one of the five categories above here, without any additional text or formatting.
                    """
            completion = client.chat.completions.create(
                model="gpt-4o-model2",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0,
                logprobs=True,
                top_logprobs=2,
                max_tokens=1,
            )

            predicted_class = completion.choices[0].message.content.strip()

            # Alternatives considered for the first (and only) output token
            candidates = completion.choices[0].logprobs.content[0].top_logprobs

            formatted_logprobs = []
            for candidate in candidates:
                formatted_logprobs.append({
                    "token": candidate.token,
                    "logprob": candidate.logprob,
                    "probability_percent": np.round(np.exp(candidate.logprob)*100,2),
                })

            return predicted_class, formatted_logprobs
        except Exception as err:
            print(f"An error occurred: {err}")
            print("Rate limit exceeded. Retrying in 60 seconds...")
            time.sleep(60)

## Combine and run full experiments

### Parallelization

In [None]:
# Plain Python list of the 'NoteChunk' column, one entry per patient row.
# NOTE(review): if df_note_grouped was reloaded from CSV, each entry is
# presumably a string rather than a list - confirm.
notes = df_note_grouped['NoteChunk'].tolist()
# notes

In [None]:
def summary_note_chunk(chunk_text):
    """
    Summarise one chunk of patient notes with the "gpt-4o-model2" deployment.

    The chunk is embedded in a neurologist-style prompt and sent through the
    module-level ``client``; no temperature is passed in this variant.

    Parameters:
    chunk_text (str): Text of one note chunk.

    Returns:
    str: Content of the first completion choice.

    The call is retried every 60 seconds, without limit, whenever it raises
    an ``Exception``. KeyboardInterrupt / SystemExit are deliberately not
    caught so the loop can be interrupted.
    """
    while True:
        try:
            response = client.chat.completions.create(
                model="gpt-4o-model2",
                messages=[
                    {"role": "system", 
                        "content": "You are a neurologist tasked with summarizing patient notes to assess cognitive impairment."},
                    {"role": "user", 
                        "content": f"""Here are the patient notes:
                        {chunk_text}
                    Please provide a concise summary focusing on the patient's cognitive health and any significant changes or diagnoses made during this period."""}
                ]
            )
            return response.choices[0].message.content
        except Exception as e:
            # Show the real cause: not every failure is a rate limit.
            print(f"An error occurred: {e}")
            print("Rate limit exceeded. Retrying in 60 seconds...")
            time.sleep(60)

In [None]:
def process_single_patient_save(patient_id, chunked_note, output_dir="../GPT/Results - sydronmic dx/Summaries of Notes"):
    """
    Summarise every note chunk of one patient, save the summaries, and diagnose.

    Chunks are summarised concurrently (8 worker threads) with
    ``summary_note_chunk``. The summaries are joined in the SAME order as the
    input chunks, since the downstream prompt presents them as chronological.
    The joined text is written to ``<output_dir>/<patient_id>_summaries.txt``
    and then passed to ``summary_of_summary``.

    Parameters:
    patient_id: Identifier used in the output file name and returned as-is.
    chunked_note (list of str or str): Note chunks for the patient. A string
        holding the text form of a list (as produced by a CSV round trip) is
        parsed back into a list; any other string is treated as one chunk.
    output_dir (str): Directory for the summaries file; created if missing.

    Returns:
    tuple: ``(patient_id, dx_output)`` where ``dx_output`` is whatever
        ``summary_of_summary`` returns.
    """
    if isinstance(chunked_note, str):
        # Iterating a str would submit one request per character.
        import ast
        try:
            parsed = ast.literal_eval(chunked_note)
        except (ValueError, SyntaxError):
            parsed = None
        chunked_note = list(parsed) if isinstance(parsed, (list, tuple)) else [chunked_note]

    with ThreadPoolExecutor(max_workers=8) as executor:
        # executor.map yields results in input order, unlike as_completed,
        # which yields them in completion order.
        single_patient_summary_list = list(executor.map(summary_note_chunk, chunked_note))

    # Join all the summaries for this patient
    single_patient_joined_summary = "\n\n".join(single_patient_summary_list)

    # Make sure the paid-for summaries can actually be written.
    os.makedirs(output_dir, exist_ok=True)
    patient_summary_file = os.path.join(output_dir, f'{patient_id}_summaries.txt')
    with open(patient_summary_file, 'w') as file:
        file.write(single_patient_joined_summary)

    # Get the final diagnosis (summary of summaries)
    dx_output = summary_of_summary(single_patient_joined_summary)

    return patient_id, dx_output


In [None]:
def process_multiple_patients_in_batches(df, save_path, group_size=10, start_group_index=0):
    """
    Run ``process_single_patient_save`` over a DataFrame in groups of patients.

    Each group is processed with 8 worker threads and its results are written
    to ``<save_path>/patient_group_<n>_summaries.json`` (n is 1-based) as a
    mapping of patient id to diagnosis output. The function pauses 60 seconds
    between groups.

    Parameters:
    df (pandas.DataFrame): Must provide 'PatientID' and 'NoteChunk' columns.
    save_path (str): Output directory; created if it does not exist so a
        finished group is not lost at write time.
    group_size (int): Number of patients per group.
    start_group_index (int): 0-based index of the first group to process
        (use it to resume an interrupted run).

    Returns:
    None
    """
    os.makedirs(save_path, exist_ok=True)

    total_patients = len(df)
    num_groups = ceil(total_patients / group_size)

    for group_index in range(start_group_index, num_groups):
        start_index = group_index * group_size
        end_index = min((group_index + 1) * group_size, total_patients)

        print(f"Processing group {group_index + 1} of {num_groups}...")
        patient_summaries = {}
        batch_df = df.iloc[start_index:end_index]

        with ThreadPoolExecutor(max_workers=8) as executor:
            future_to_patient = {executor.submit(process_single_patient_save, patient_id, chunked_note): patient_id
                                 for patient_id, chunked_note in zip(batch_df['PatientID'], batch_df['NoteChunk'])}

            for future in as_completed(future_to_patient):
                patient_id, dx_output = future.result()
                patient_summaries[patient_id] = dx_output

        filename = f"patient_group_{group_index + 1}_summaries.json"
        file_path = os.path.join(save_path, filename)
        # Save the results of this batch to a JSON file
        with open(file_path, 'w') as json_file:
            json.dump(patient_summaries, json_file, indent=4)

        print(f"Group {group_index + 1} processed and saved.")
        # Pause between groups only; nothing follows the last one.
        if group_index + 1 < num_groups:
            time.sleep(60)

In [None]:
save_path = "../GPT/Results - sydronmic dx/GPT4o - Attempt 2"
# Resumes at group index 16 (i.e. the 17th group of 20 patients);
# set start_group_index=0 to start from the beginning.
process_multiple_patients_in_batches(df_note_grouped, save_path, group_size=20, start_group_index=16)


### Summary from text

In [None]:
def read_summary_from_file(patient_id, input_dir='summaries'):
    """
    Return the saved summaries text for one patient.

    Reads ``<input_dir>/<patient_id>_summaries.txt`` in text mode and returns
    its full content.
    """
    with open(os.path.join(input_dir, f'{patient_id}_summaries.txt'), 'r') as handle:
        return handle.read()

def process_single_patient(patient_id, input_dir):
    """
    Load one patient's saved summaries and run ``summary_of_summary`` on them.

    Returns:
    tuple: ``(patient_id, result)`` where ``result`` is whatever the current
        ``summary_of_summary`` definition returns.
    """
    saved_text = read_summary_from_file(patient_id, input_dir)
    return patient_id, summary_of_summary(saved_text)

def process_multiple_patients_in_batches(patient_ids, input_dir, output_dir, group_size=10, start_group_index=0):
    """
    Diagnose patients from their saved summaries, group by group.

    Every group from ``start_group_index`` up to the last one is processed
    (previously only the single group at ``start_group_index`` ran, leaving
    the computed group count unused). Each group uses 16 worker threads and
    is written to ``<output_dir>/patient_group_<n>_summaries.json`` (n is
    1-based) as a mapping of patient id to result.

    Parameters:
    patient_ids (list): Patient identifiers, in processing order.
    input_dir (str): Directory holding ``<patient_id>_summaries.txt`` files.
    output_dir (str): Directory for the JSON results; created if missing.
    group_size (int): Number of patients per group.
    start_group_index (int): 0-based index of the first group to process.

    Returns:
    None
    """
    os.makedirs(output_dir, exist_ok=True)

    total_patients = len(patient_ids)
    num_groups = ceil(total_patients / group_size)

    for group_index in range(start_group_index, num_groups):
        start_index = group_index * group_size
        end_index = min((group_index + 1) * group_size, total_patients)
        # Progress line instead of dumping the raw patient identifiers.
        print(f"Processing group {group_index + 1} of {num_groups}...")

        patient_summaries = {}
        batch_patient_ids = patient_ids[start_index:end_index]
        with ThreadPoolExecutor(max_workers=16) as executor:
            future_to_patient = {executor.submit(process_single_patient, patient_id, input_dir): patient_id
                                 for patient_id in batch_patient_ids}

            for future in as_completed(future_to_patient):
                patient_id, result = future.result()
                patient_summaries[patient_id] = result

        # Save the results of this batch to a JSON file
        json_filename = os.path.join(output_dir, f'patient_group_{group_index + 1}_summaries.json')
        with open(json_filename, 'w') as json_file:
            json.dump(patient_summaries, json_file, indent=4)

        print(f"Group {group_index + 1} processed and saved to {json_filename}")

# Read saved summaries from input_dir and write JSON results to output_dir.
input_dir = "../GPT/Results - sydronmic dx/Summaries of Notes"
output_dir = "../GPT/Results - sydronmic dx/GPT4o - Attempt 2"
patient_ids = df_note_grouped['PatientID'].tolist()
# Run with groups of 20 patients, starting at group index 21 (group number 22)
process_multiple_patients_in_batches(patient_ids, input_dir=input_dir, output_dir=output_dir, group_size=20, start_group_index=21)


### Revision  - Logprobs

In [None]:
def summary_of_summary(summary):
    """
    Revision: 3-class single-token diagnosis with log-probabilities, as a dict.

    Same prompt as the earlier 3-class logprob attempt, but sampled at
    temperature 0.1 and returned as a dictionary.

    Parameters:
    summary (str): Chunk summaries for one patient, interpolated into the prompt.

    Returns:
    dict: ``{"prediction": <stripped response text>, "logprobs": [...]}`` where
        each logprob entry has keys "token", "logprob" and
        "probability_percent" (exp(logprob) * 100, rounded to 2 decimals).

    Any exception is printed and the request is retried after 60 seconds,
    without limit (the message mentions rate limiting regardless of cause).
    """
    system_prompt = "You are a neurologist tasked with making syndromic diagnoses of cognitive impairmen using summaries of patient notes over a three-year period."

    while True:
        try:
            user_prompt = f"""Here are the summaries in chronological order:
                    {summary}
                    Based on these summaries, provide an overall classification of the patient's cognitive health over the three-year period into one of the following syndromic diagnoses:
                    1. Normal
                    2. MCI 
                    3. Dementia
                    Provide the response with just one number of one of the three categories above here, without any additional text or formatting.
                    """
            completion = client.chat.completions.create(
                model="gpt-4o-model2",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0.1,
                logprobs=True,
                top_logprobs=2,
                max_tokens=1,
            )

            # The classification itself (e.g. "1", "2", or "3")
            predicted_class = completion.choices[0].message.content.strip()

            # Alternatives considered for the first (and only) output token
            candidates = completion.choices[0].logprobs.content[0].top_logprobs

            formatted_logprobs = []
            for candidate in candidates:
                formatted_logprobs.append({
                    "token": candidate.token,
                    "logprob": candidate.logprob,
                    "probability_percent": np.round(np.exp(candidate.logprob)*100,2),
                })

            return {"prediction": predicted_class, "logprobs": formatted_logprobs}
        except Exception as err:
            print(f"An error occurred: {err}")
            print("Rate limit exceeded. Retrying in 60 seconds...")
            time.sleep(60)

In [None]:
def read_summary_from_file(patient_id, input_dir='summaries'):
    """
    Return the saved summaries text for one patient.

    Reads ``<input_dir>/<patient_id>_summaries.txt`` in text mode and returns
    its full content.
    """
    with open(os.path.join(input_dir, f'{patient_id}_summaries.txt'), 'r') as handle:
        return handle.read()

def process_single_patient(patient_id, input_dir):
    """
    Load one patient's saved summaries and run ``summary_of_summary`` on them.

    Returns:
    tuple: ``(patient_id, result)`` where ``result`` is whatever the current
        ``summary_of_summary`` definition returns.
    """
    saved_text = read_summary_from_file(patient_id, input_dir)
    return patient_id, summary_of_summary(saved_text)

In [None]:
def process_multiple_patients_in_batches(patient_ids, input_dir, output_dir, group_size=10, start_group_index=0):
    """
    Diagnose patients from their saved summaries, group by group.

    Groups from ``start_group_index`` to the last one are processed with 16
    worker threads each; every group is written to
    ``<output_dir>/patient_group_<n>_logprobs.json`` (n is 1-based) as a
    mapping of patient id to result, in the order results complete.

    Parameters:
    patient_ids (list): Patient identifiers, in processing order.
    input_dir (str): Directory holding ``<patient_id>_summaries.txt`` files.
    output_dir (str): Directory for the JSON results; created if missing.
    group_size (int): Number of patients per group.
    start_group_index (int): 0-based index of the first group to process.

    Returns:
    None
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    patient_count = len(patient_ids)
    num_groups = ceil(patient_count / group_size)

    for group_index in range(start_group_index, num_groups):
        group_number = group_index + 1
        first = group_index * group_size
        last = min(first + group_size, patient_count)
        print(f"Processing group {group_number} of {num_groups}...")

        results = {}
        with ThreadPoolExecutor(max_workers=16) as pool:
            pending = [
                pool.submit(process_single_patient, pid, input_dir)
                for pid in patient_ids[first:last]
            ]
            for finished in as_completed(pending):
                pid, outcome = finished.result()
                results[pid] = outcome

        # One JSON file per group
        json_filename = os.path.join(output_dir, f'patient_group_{group_number}_logprobs.json')
        with open(json_filename, 'w') as out_file:
            json.dump(results, out_file, indent=4)

        print(f"Group {group_number} processed and saved to {json_filename}")

In [None]:
# Read saved summaries from input_dir; write per-group JSON to output_dir.
input_dir = "../GPT/Results - sydronmic dx/Summaries of Notes"
output_dir = "../GPT/Results - sydronmic dx/GPT4o - Attempt 5 - Revision"

patient_ids = df_note_grouped['PatientID'].tolist()
# Process all patients in batches and save results
process_multiple_patients_in_batches(patient_ids, input_dir=input_dir, output_dir=output_dir, group_size=20, start_group_index=0)

#### 5 Class

In [None]:
# NOTE(review): this run uses whichever summary_of_summary definition was
# executed most recently; presumably the 5-class prompt cell must be re-run
# before this one - confirm.
input_dir = "../GPT/Results - sydronmic dx/Summaries of Notes"
output_dir = "../GPT/Results - sydronmic dx/GPT4o - Attempt 8 - Revision"
patient_ids = df_note_grouped['PatientID'].tolist()
# Process all patients in batches and save results
process_multiple_patients_in_batches(patient_ids, input_dir=input_dir, output_dir=output_dir, group_size=20, start_group_index=0)

#### 5 Class logprob

In [None]:
# NOTE(review): this run uses whichever summary_of_summary definition was
# executed most recently; presumably the 5-class logprob cell must be re-run
# before this one - confirm.
input_dir = "../GPT/Results - sydronmic dx/Summaries of Notes"
output_dir = "../GPT/Results - sydronmic dx/GPT4o - Attempt 9 - Revision"

patient_ids = df_note_grouped['PatientID'].tolist()
# Process all patients in batches and save results
process_multiple_patients_in_batches(patient_ids, input_dir=input_dir, output_dir=output_dir, group_size=20, start_group_index=0)