In [136]:
import json
import re
import logging
import os
import openai
import pandas as pd 

from openai import OpenAI
from dotenv import load_dotenv


# Configuration


In [26]:
!pip install python-dotenv



In [61]:
credentials_path = 'credentials_nk.json'

with open(credentials_path, 'r') as file:
    credentials = json.load(file)

    
openai_api_key = credentials['openai_api_key']
base_url = credentials['base_url']

client = OpenAI(api_key=openai_api_key)







In [400]:
def generate_evaluation(filepath: str):
    data = clean_data(filepath)
    user_prompt = f"""\
Please evaluate the following summary that the assistant generated based on the following criteria and rate each on a scale of 1 to 5, where:
1 = Fails to meet basic standards, 2 = Imcomplete or unclear in multiple areas, 3 = Acceptable but has noticeable shortcomings, 4 = meeting expectations with minor issues, 5 = Exceeds expectations with no major flaws. If uncertain, do not give a score of 0. Choose the lowest score that applies.  
The criteria are:
1. **Completeness**: Does the summary cover all relevant aspects of the patient's condition, treatment, and follow-up care? Are there any important details missing?
2. **Coherence**: Is the summary logically organized and easy to understand? Does it follow a clear narrative flow?
3. **Adherence to Clinical Standards**: Does the summary meet professional medical standards in terms of language, format, and terminology? For example, does it use appropriate medical terminology and abbreviations correctly? 
4. **Compassion**: Does the summary convey empathy for the patient and reflect a caring attitude toward their recovery?

After each criterion, please provide a rating (1-5) and a brief explanation of your reasoning for the score. "Please return your response in JSON format with each criterion as a key, and the value as an object with 'score' and 'explanation'.



This is the discharge summmary letter: 
{data}
"""

    
    client = OpenAI(api_key=openai_api_key)
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a medical expert who specializes in evaluating clinical summaries."},
            {"role": "user", "content": user_prompt}
        ],
    )

    response = completion.choices[0].message.content
    

    clean_response = response.replace("\n"," ").replace("```json", "").replace("```", "").strip()
    clean_response = re.sub(r"\s{2,}","", clean_response)

    try:
        data_dict = json.loads(clean_response)
    except json.JSONDecodeError as e:
        print(f"JSON Error: {e}")
    
      

    logging.info(f'<data>\n{filepath}\n</data>')
    #logging.info(f'<system>\n{system_prompt}\n</system>')
    logging.info(f'<user>\n{user_prompt}\n</user>')
    logging.info(f'<assistant>:\n{response}\n</assistant>')

    patient_number = "Patient #" + " " +  filepath[5]

    method = extract_method(filepath)

    
    return patient_number, method, data_dict


def clean_data(filepath: str) -> dict:
    with open(f'experiments/{filepath}', 'r') as file:
        data = json.load(file)
        letter = data['assistant']
    return letter #only retrieving the summary

def extract_method(string):
    return string[7:-7]
    # match = re.search(r"(baseline|instruction|instruction_example)", string)
    # if match:
    #     return match.group(0)
    # else:
    #     return None
    
generate_evaluation(filepath)


('Patient # 4',
 'baseline',
 {'Completeness': {'score': 4,
   'explanation': "The summary is mostly complete, detailing the patient's admission, diagnosis, treatment, and discharge instructions. It includes relevant details about symptoms, diagnostic findings, treatment approach, and follow-up care. However, it lacks information on any specific patient education provided at discharge or any lifestyle modifications recommended."},
  'Coherence': {'score': 5,
   'explanation': 'The summary is logically organized and follows a clear narrative flow from admission through treatment and discharge instructions. It is easy to understand and presents information in a structured manner.'},
  'Adherence to Clinical Standards': {'score': 4,
   'explanation': 'The summary uses appropriate medical terminology and follows a professional format. There is a slight issue with the use of abbreviations without initial definitions (e.g., CP for chest pain and SoB for shortness of breath), as they are not 

# Evaluating through all the baseline experiments (20)

In [374]:
m

'i'

In [380]:
os.makedirs('evaluations', exist_ok=True)
# prompt_method= ['baseline', 'instruction', 'instruction_example']
for i in range(1, 5):
    for j in range (1, 6):

        filepath = f'data_{i}_instruction_example_{j}.json'
        patient_number, method, data_dict = generate_evaluation(filepath)

        print(data_dict)


        output = f'evaluations/data_{i}_instruction_example_{j}.json'
        with open(output, 'w', encoding='utf-8') as json_file:
                json.dump(data_dict, json_file, indent=4)
        # filepath = f'data_{i}_baseline_{j}.json'
        # patient_number, method, data_dict = generate_evaluation(filepath)

        # print(data_dict)


        # output = f'evaluations/data_{i}_{method}_{j}.json'
        # with open(output, 'w', encoding='utf-8') as json_file:
        #     json.dump(data_dict, json_file, indent=4)
#mean and standard deviation for each method

{'Completeness': {'score': 4, 'explanation': "The summary covers the key aspects of the patient's condition, treatment, and follow-up care quite thoroughly. It discusses the diagnosis, initial presentation, treatment adjustments, and discharge instructions. However, more details about the patient's history, any underlying conditions, or potential risk factors for pneumonia could have been included for a higher score."}, 'Coherence': {'score': 5, 'explanation': "The summary is logically organized and follows a clear narrative flow from admission to discharge. It is easy to understand, with well-structured sentences and a chronological presentation of the patient's hospital stay and treatment progression."}, 'Adherence to Clinical Standards': {'score': 5, 'explanation': 'The summary uses appropriate medical terminology and follows a professional format. Abbreviations and medical terms are used correctly, and clinical data is presented in a clear manner that meets professional standards.'

In [381]:
os.makedirs('evaluations', exist_ok=True)
# prompt_method= ['baseline', 'instruction', 'instruction_example']
for i in range(1, 5):
    for j in range (1, 6):

        filepath = f'data_{i}_instruction_{j}.json'
        patient_number, method, data_dict = generate_evaluation(filepath)

        print(data_dict)


        output = f'evaluations/data_{i}_instruction_{j}.json'
        with open(output, 'w', encoding='utf-8') as json_file:
                json.dump(data_dict, json_file, indent=4)

#mean and standard deviation for each method

{'Completeness': {'score': 4, 'explanation': "The summary overall documents the patient's symptoms, diagnosis, treatment plan, response to therapy, and discharge instructions well. It includes detailed observations of clinical improvements and outlines the follow-up plan. However, it lacks details on the unspecified organism, which could be relevant if further investigation or notification to the patient is needed post-discharge."}, 'Coherence': {'score': 5, 'explanation': "The summary is logically structured, following a clear timeline from admission through discharge. Each section flows naturally into the next, making it easy to follow the patient's progress and understand the medical decisions made. The narrative is easy to comprehend and professionally presented."}, 'Adherence to Clinical Standards': {'score': 5, 'explanation': 'The summary uses appropriate medical terminology and correctly follows the format expected in a discharge summary. Medical abbreviations are clear, and the

In [382]:
os.makedirs('evaluations', exist_ok=True)
# prompt_method= ['baseline', 'instruction', 'instruction_example']
for i in range(1, 5):
    for j in range (1, 6):

        filepath = f'data_{i}_baseline_{j}.json'
        patient_number, method, data_dict = generate_evaluation(filepath)

        print(data_dict)


        output = f'evaluations/data_{i}_baseline_{j}.json'
        with open(output, 'w', encoding='utf-8') as json_file:
                json.dump(data_dict, json_file, indent=4)

#mean and standard deviation for each method

{'Completeness': {'score': 4, 'explanation': "The summary effectively covers the patient's condition, treatment, and follow-up care. However, it lacks specific details on the patient's past medical history, any allergies, or potential side effects of the prescribed medications, which could be important for comprehensive understanding."}, 'Coherence': {'score': 5, 'explanation': "The summary is logically organized and easy to follow. It presents the information in a clear narrative flow from admission to discharge, detailing the patient's progress effectively."}, 'Adherence to Clinical Standards': {'score': 5, 'explanation': 'The summary uses appropriate medical terminology and adheres to professional standards. The language is precise, and medical abbreviations are used correctly, such as CRP and oxygen saturation levels.'}, 'Compassion': {'score': 4, 'explanation': "The summary conveys a caring attitude toward the patient's recovery, particularly in the closing paragraphs. However, in

In [361]:
os.getcwd()

'c:\\Users\\nkann\\OneDrive\\Documents\\GitHub\\Discharge-Prototype'

In [384]:
eval_files = [filename for filename in os.listdir('evaluations') if filename.endswith('.json')]

In [385]:
eval_files

['data_1_baseline_1.json',
 'data_1_baseline_2.json',
 'data_1_baseline_3.json',
 'data_1_baseline_4.json',
 'data_1_baseline_5.json',
 'data_1_instruction_1.json',
 'data_1_instruction_2.json',
 'data_1_instruction_3.json',
 'data_1_instruction_4.json',
 'data_1_instruction_5.json',
 'data_1_instruction_example_1.json',
 'data_1_instruction_example_2.json',
 'data_1_instruction_example_3.json',
 'data_1_instruction_example_4.json',
 'data_1_instruction_example_5.json',
 'data_2_baseline_1.json',
 'data_2_baseline_2.json',
 'data_2_baseline_3.json',
 'data_2_baseline_4.json',
 'data_2_baseline_5.json',
 'data_2_instruction_1.json',
 'data_2_instruction_2.json',
 'data_2_instruction_3.json',
 'data_2_instruction_4.json',
 'data_2_instruction_5.json',
 'data_2_instruction_example_1.json',
 'data_2_instruction_example_2.json',
 'data_2_instruction_example_3.json',
 'data_2_instruction_example_4.json',
 'data_2_instruction_example_5.json',
 'data_3_baseline_1.json',
 'data_3_baseline_2.jso

In [401]:
evaluations = []
for file in eval_files:
    with open(f'evaluations\{file}', 'r') as json_file:
        data_dict = json.load(json_file)
        patient_number = "Patient #" + " " +  file[5]
        method = extract_method(file)

        Completeness_score = int(data_dict.get('Completeness', {}).get('score')) 
        Coherence_score = int(data_dict.get('Coherence', {}).get('score')) 
        Ad_score = int(data_dict.get('Adherence to Clinical Standards', {}).get('score')) 
        # print(Adherence_score)
        Compassion_score = int(data_dict.get('Compassion', {}).get('score')) 

        Total_score = Completeness_score + Coherence_score + Ad_score + Compassion_score

              
        evaluations.append((
            patient_number, 
            method, 
            Completeness_score, 
            Coherence_score,
            Ad_score,
            Compassion_score, 
            Total_score ))


df_b = pd.DataFrame(evaluations, columns=['Patient', 'Method' ,'Completeness Score', 'Coherence Score', 'Adherence to Clinical Standards Score', 'Compassion Score', 'Total Score'])
df_b.dropna(subset=['Patient', 'Method'], inplace=True)

  with open(f'evaluations\{file}', 'r') as json_file:


In [402]:
df_b.to_csv('evaluations/evaluations.csv', index=False)

In [403]:
df_b.drop(columns = ['Patient']).groupby('Method').agg(['mean', 'std']).reset_index()

Unnamed: 0_level_0,Method,Completeness Score,Completeness Score,Coherence Score,Coherence Score,Adherence to Clinical Standards Score,Adherence to Clinical Standards Score,Compassion Score,Compassion Score,Total Score,Total Score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
0,baseline,4.05,0.759155,4.75,0.444262,4.7,0.470162,4.0,0.725476,17.5,1.732051
1,instruction,4.4,0.502625,4.85,0.366348,4.95,0.223607,3.8,0.410391,18.0,0.973329
2,instruction_example,4.3,0.571241,4.75,0.444262,4.85,0.366348,3.75,0.444262,17.65,1.268028


In [404]:
df_b['Method'].value_counts()

Method
baseline               20
instruction            20
instruction_example    20
Name: count, dtype: int64

# Evaluating through Instruction (20)

In [326]:
#os.makedirs('evaluations', exist_ok=True)
evaluations = []
for i in range(1, 5):
    for j in range (1, 6):
        filepath = f'data_{i}_instruction_{j}.json'
        patient_number, method, data_dict = generate_evaluation(filepath)

       
        #output = f'evaluations/data_{i}_{method}_{j}.json'
        #ith open(output, 'w', encoding='utf-8') as json_file:
            #json.dump(data_dict, json_file, indent=4)

        Completeness_score = int(data_dict.get('Completeness', {}).get('score', 0)) 
        Coherence_score = int(data_dict.get('Coherence', {}).get('score', 0)) 
        Adherence_score = int(data_dict.get('Adherence', {}).get('score', 0)) 
        Compassion_score = int(data_dict.get('Compassion', {}).get('score', 0))

        Total_score = Completeness_score + Coherence_score + Adherence_score + Compassion_score

        
              
        evaluations.append((
            patient_number, 
            method, 
            Completeness_score, 
            Coherence_score,
            Adherence_score,
            Compassion_score, 
            Total_score ))
        
        


df_i = pd.DataFrame(evaluations, columns=['Patient', 'Method' ,'Completeness Score', 'Coherence Score', 'Adherence to Clinical Standards Score', 'Compassion Score', 'Total Score'])
df_i.dropna(subset=['Patient', 'Method'], inplace=True)

print(df_i.head())


       Patient       Method  Completeness Score  Coherence Score  \
0  Patient # 1  instruction                   4                5   
1  Patient # 1  instruction                   5                5   
2  Patient # 1  instruction                   5                5   
3  Patient # 1  instruction                   5                5   
4  Patient # 1  instruction                   5                5   

   Adherence to Clinical Standards Score  Compassion Score  Total Score  
0                                      0                 4           13  
1                                      0                 4           14  
2                                      0                 4           14  
3                                      0                 4           14  
4                                      0                 4           14  


In [372]:
(df_i['Adherence to Clinical Standards Score'] == 0).sum()

20

# Evaluate instruction with example (20)

In [320]:
os.makedirs('evaluations', exist_ok=True)
evaluations = []
for i in range(1, 5):
    for j in range (1, 6):
        filepath = f'data_{i}_instruction_example_{j}.json'
        patient_number, method, data_dict = generate_evaluation(filepath)

       
        output = f'evaluations/data_{i}_instruction_example_{j}.json'
        with open(output, 'w', encoding='utf-8') as json_file:
            json.dump(data_dict, json_file, indent=4)

        Completeness_score = int(data_dict.get('Completeness', {}).get('score', 0)) 
        Coherence_score = int(data_dict.get('Coherence', {}).get('score', 0)) 
        Adherence_score = int(data_dict.get('Adherence', {}).get('score', 0)) 
        Compassion_score = int(data_dict.get('Compassion', {}).get('score', 0))

        Total_score = Completeness_score + Coherence_score + Adherence_score + Compassion_score

        
              
        evaluations.append((
            patient_number, 
            method, 
            Completeness_score, 
            Coherence_score,
            Adherence_score,
            Compassion_score, 
            Total_score ))
        #with open(output, 'w') as outfile:
        


df_ie = pd.DataFrame(evaluations, columns=['Patient', 'Method' ,'Completeness Score', 'Coherence Score', 'Adherence to Clinical Standards Score', 'Compassion Score', 'Total Score'])
df_ie.dropna(subset=['Patient', 'Method'], inplace=True)

print(df_ie.head())

       Patient       Method  Completeness Score  Coherence Score  \
0  Patient # 1  instruction                   4                4   
1  Patient # 1  instruction                   4                5   
2  Patient # 1  instruction                   4                5   
3  Patient # 1  instruction                   3                4   
4  Patient # 1  instruction                   4                5   

   Adherence to Clinical Standards Score  Compassion Score  Total Score  
0                                      0                 3           11  
1                                      0                 4           13  
2                                      0                 4           13  
3                                      0                 3           10  
4                                      0                 3           12  


# Combing the dataframes into one

In [321]:
complete_df = pd.concat([df_b, df_i, df_ie], ignore_index=True)
complete_df.to_csv('evaluations/complete_df.csv', index=False)