In [2]:
import pandas as pd
import random

# Location of the experiment spreadsheet (reused by the reload cell below)
path = "./data/experiment/experiment.xlsx"

# Read the spreadsheet into a DataFrame
with open(path, 'rb') as f:
    df = pd.read_excel(f)

# Fixed seed: the same permutations are drawn on every fresh run
random.seed(42)

# Columns that receive one position (1-4) each per row
rank_columns = ('Ground Truth', 'BioGPT', 'BioGPT-Large', 'BioGPT - ALL')

# For every row, draw a random ordering of 1..4 and hand one position to
# each of the four text-source columns, in the column order given above
for idx in df.index:
    random_chain = random.sample([1, 2, 3, 4], 4)
    for column, position in zip(rank_columns, random_chain):
        df.at[idx, column] = position

# Store the positions as integers
for column in rank_columns:
    df[column] = df[column].astype(int)

# Rows are addressed by their 'Medical Reports' value from here on
df = df.set_index('Medical Reports')

In [3]:
# Re-read the spreadsheet, using its first column as the index.
# NOTE(review): this rebinds `df`, so the permutations drawn in the previous
# cell are discarded unless they were written back to `path` beforehand —
# presumably the file on disk already holds the saved assignment; confirm.
df = pd.read_excel(path, index_col=0)

In [4]:
import json

# One JSON file per model; later cells read `image_id`, `caption` and
# (from the BioGPT file) `gt_caption` out of their entries.
path_biogpt = "./data/experiment/biogpt.json"
path_biogpt_large = "./data/experiment/biogpt_large2.json"
path_biogpt_all = "./data/experiment/biogpt-all.json"

# JSON text is UTF-8 by specification. Passing the encoding explicitly keeps
# decoding independent of the platform's locale default (e.g. cp1252 on
# Windows), which would otherwise garble or reject non-ASCII characters.
with open(path_biogpt, 'r', encoding='utf-8') as f:
    biogpt = json.load(f)

with open(path_biogpt_large, 'r', encoding='utf-8') as f:
    biogpt_large = json.load(f)

with open(path_biogpt_all, 'r', encoding='utf-8') as f:
    biogpt_all = json.load(f)

In [5]:
from typing import Tuple
import re

def filter_duplicate_sentences(string: str) -> Tuple[str, int]:
    """Remove repeated sentences from a text, keeping first occurrences.

    The text is split after '.', '?' or '!' when followed by whitespace that
    is not itself followed by a digit, and after every newline. Each fragment
    is stripped of surrounding whitespace; fragments that are empty after
    stripping (blank lines, trailing whitespace) are ignored, so they neither
    appear in the output nor count as duplicates.

    Args:
        string: The text to de-duplicate.

    Returns:
        A tuple ``(text, n_duplicates)`` where ``text`` is the unique
        sentences in original order joined by single spaces, and
        ``n_duplicates`` is the number of repeated sentences dropped.
    """
    sentence_split_pattern = r'(?<=[.?!])(?=\s(?![0-9]))|(?<=\n)'

    seen = set()            # O(1) membership test for already-kept sentences
    unique_sentences = []   # preserves first-occurrence order
    n_duplicates = 0
    for fragment in re.split(sentence_split_pattern, string):
        sentence = fragment.strip()
        if not sentence:
            # Whitespace-only fragment: not a sentence, so skip it instead of
            # letting it inflate the duplicate count or add stray spaces.
            continue
        if sentence in seen:
            n_duplicates += 1
            continue
        seen.add(sentence)
        unique_sentences.append(sentence)

    return (' '.join(unique_sentences), n_duplicates)

path_mapping = "./data/patient_info/report_id_specimen_map.json"
# Explicit UTF-8 so decoding does not depend on the platform locale default.
with open(path_mapping, 'r', encoding='utf-8') as f:
    map_report_id_specimen = json.load(f)

# (column label, loaded model output) pairs. BioGPT comes first because its
# entries also supply the ground-truth caption, which must be stored before
# the model captions to keep the key order of `experiment_texts` rows.
model_outputs = (
    ('BioGPT', biogpt),
    ('BioGPT-Large', biogpt_large),
    ('BioGPT - ALL', biogpt_all),
)

# results[report][model]          -> number of duplicate sentences removed
# experiment_texts[report][label] -> text shown for that label
results = {}
experiment_texts = {}
for medical_report in df.index:
    experiment_texts[medical_report] = {}
    results[medical_report] = {}
    # Specimen id under which this report's captions are stored in the JSONs
    specimen = map_report_id_specimen[medical_report]
    for model_name, model_reports in model_outputs:
        # Full scan on purpose: if several entries share the specimen id,
        # the last one wins, as with the original per-model loops.
        for reports in model_reports:
            if reports['image_id'] != specimen:
                continue
            caption, n_duplicates = filter_duplicate_sentences(reports['caption'])
            if model_name == 'BioGPT':
                # The reference caption is read from the BioGPT file only
                experiment_texts[medical_report]['Ground Truth'] = reports['gt_caption']
            experiment_texts[medical_report][model_name] = caption
            results[medical_report][model_name] = n_duplicates

In [6]:
# shuffled[report] is a 4-slot list; slot k-1 holds the text assigned position k
shuffled = {}
for medical_report, texts in experiment_texts.items():
    slots = shuffled[medical_report] = ["", "", "", ""]

    # Row of `df` for this report as {column label: position}, e.g.
    # {"Ground Truth": a, "BioGPT": b, "BioGPT-Large": c, "BioGPT - ALL": d}
    position_by_title = df.loc[medical_report].to_dict()
    for title, index in position_by_title.items():
        header = f"Text {index} ({medical_report})\n"
        # Positions are 1-based in the sheet, list slots are 0-based
        slots[index - 1] = header + texts[title]

In [None]:
# Print every report id followed by its four texts in their shuffled order
for report_id, ordered_texts in shuffled.items():
    print(f"Medical Report Number: {report_id}")
    for text in ordered_texts:
        print(text)