## Dependencies

In [1]:
import os
import sys

# Add the parent of the current working directory to the import path.
# Presumably this is where the `medbench` package imported below lives — TODO confirm.
sys.path.append(os.path.join(os.getcwd(), '..'))


In [22]:
%load_ext autoreload
%autoreload 2

import json
import tempfile
from functools import partial

import pandas as pd
from loguru import logger

from medbench.aml import AzureML

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load data

### `mtsamples-gpt4-summarization`

This dataset consists of two JSONL files whose entries largely overlap. For the purposes of this notebook, we merge both files into a single DataFrame.

In [3]:
# Connect to the `azureml-1p` registry; `aml` is used below to download and register datasets.
aml = AzureML.connect_to_registry(registry_name="azureml-1p")

In [4]:
# Names of the two JSONL files read from the `mtsamples-gpt4-summarization` dataset below.
rounding_output = "rounding_output.jsonl"
output_clean = "output-clean-gpt4.jsonl"

In [5]:
# Fetch the rounding-output file from the latest dataset version as a DataFrame.
mtsamples_readings_output_df = aml.get_dataset(
    name="mtsamples-gpt4-summarization",
    version="latest",
    read_folder_jsonl=True,
    target_jsonl=rounding_output,
)

# Report the dimensions, then preview the first rows.
print(mtsamples_readings_output_df.shape)
mtsamples_readings_output_df.head()

2024-10-31 16:29:00.851 | DEBUG    | medbench.aml:get_dataset:130 - Downloading rounding_output.jsonl to temporary directory `/tmp/tmpnl55mwc_`...
2024-10-31 16:29:05.902 | INFO     | medbench.aml:get_dataset:135 - Reading rounding_output.jsonl as a pandas DataFrame.


(1252, 6)


  return pd.read_json(jsonl_str, lines=True)


Unnamed: 0,Keywords,Description,Filename,Clinical_Note,Medical_Specialty,output_gpt-4
0,"pediatrics - neonatal, 1-year-old, naps, mama,...",Health maintenance exam for 1-year-old female.,1-year-old Exam - H&P.txt,CHIEF COMPLAINT: This 1-year-old female presen...,Pediatrics - Neonatal,"{\n ""History"": ""This 1-year-old female patien..."
1,"radiology, 2-d study, doppler, tricuspid regur...","Normal left ventricle, moderate biatrial enlar...",2-D Doppler.txt,"2-D STUDY1. Mild aortic stenosis, widely calci...",Radiology,"{\n ""History"": ""The patient's chart does not ..."
2,"radiology, 2-d m-mode, doppler, aortic valve, ...",2-D M-Mode. Doppler.,2-D Echocardiogram - 1.txt,2-D M-MODE: 1. Left atrial enlargement with l...,Radiology,"{\n ""History"": ""The patient's chart does not ..."
3,"radiology, 2-d, doppler, echocardiogram, annul...",2-D Echocardiogram,2-D Echocardiogram - 2.txt,COMMENTS:1. The left ventricular cavity size a...,Radiology,"{\n ""History"": ""The patient's chart does not ..."
4,"radiology, 2-d echocardiogram, cardiac functio...",2-D Echocardiogram,2-D Echocardiogram - 3.txt,2-D ECHOCARDIOGRAMMultiple views of the heart ...,Radiology,"{\n ""History"": ""No relevant medical history, ..."


In [6]:
# Fetch the clean-output file from the latest dataset version as a DataFrame.
mtsamples_clean_output_df = aml.get_dataset(
    name="mtsamples-gpt4-summarization",
    version="latest",
    read_folder_jsonl=True,
    target_jsonl=output_clean,
)

# Report the dimensions, then preview the first rows.
print(mtsamples_clean_output_df.shape)
mtsamples_clean_output_df.head()

2024-10-31 16:29:07.230 | DEBUG    | medbench.aml:get_dataset:130 - Downloading output-clean-gpt4.jsonl to temporary directory `/tmp/tmp1d28lp34`...
2024-10-31 16:29:09.734 | INFO     | medbench.aml:get_dataset:135 - Reading output-clean-gpt4.jsonl as a pandas DataFrame.


(1255, 6)


  return pd.read_json(jsonl_str, lines=True)


Unnamed: 0,Keywords,Description,Filename,Clinical_Note,Medical_Specialty,output_gpt-4
0,"pediatrics - neonatal, 1-year-old, naps, mama...",Health maintenance exam for 1-year-old female.,1-year-old Exam - H&P.txt,CHIEF COMPLAINT: This 1-year-old female presen...,Pediatrics - Neonatal,"{\n""indication"": ""Routine well child care, Acu..."
1,"radiology, 2-d study, doppler, tricuspid regu...","Normal left ventricle, moderate biatrial enla...",2-D Doppler.txt,"2-D STUDY1. Mild aortic stenosis, widely calci...",Radiology,"{\n""indication"": ""Mild aortic stenosis, Mild l..."
2,"radiology, 2-d m-mode, doppler, aortic valve,...",2-D M-Mode. Doppler.,2-D Echocardiogram - 1.txt,2-D M-MODE: 1. Left atrial enlargement with l...,Radiology,"{\n""indication"": ""Left atrial enlargement, Mil..."
3,"radiology, 2-d, doppler, echocardiogram, annu...",2-D Echocardiogram,2-D Echocardiogram - 2.txt,COMMENTS:1. The left ventricular cavity size a...,Radiology,"{\n""indication"": ""Cardiac evaluation"",\n""indic..."
4,"radiology, 2-d echocardiogram, cardiac functi...",2-D Echocardiogram,2-D Echocardiogram - 3.txt,2-D ECHOCARDIOGRAMMultiple views of the heart ...,Radiology,"{\n""indication"": ""Routine check-up"",\n""indicat..."


To maximize the number of matching rows, we strip leading and trailing whitespace from every column of both DataFrames before merging.

In [7]:
# Strip surrounding whitespace from every column of both frames.
# The column names are taken from the clean frame and applied to both.
for column_name in mtsamples_clean_output_df.columns:
    for frame in (mtsamples_clean_output_df, mtsamples_readings_output_df):
        frame[column_name] = frame[column_name].str.strip()

In [8]:
# Outer-join the two files on the descriptive columns; columns that exist in both frames
# but are not join keys get a `_rounding` / `_clean` suffix.
merge_keys = ["Keywords", "Description", "Filename", "Medical_Specialty"]

mt_samples_summarization = mtsamples_readings_output_df.merge(
    mtsamples_clean_output_df,
    on=merge_keys,
    how="outer",
    suffixes=("_rounding", "_clean"),
)

print(mt_samples_summarization.shape)
mt_samples_summarization.head()

(1266, 8)


Unnamed: 0,Keywords,Description,Filename,Clinical_Note_rounding,Medical_Specialty,output_gpt-4_rounding,Clinical_Note_clean,output_gpt-4_clean
0,"allergy / immunology, allergic rhinitis, aller...",A 23-year-old white female presents with compl...,Allergic Rhinitis.txt,SUBJECTIVE: This 23-year-old white female pres...,Allergy / Immunology,"{\n ""History"": ""The patient is a 23-year-old ...",SUBJECTIVE: This 23-year-old white female pres...,"{\n""indication"": ""Allergic rhinitis"",\n""indica..."
1,"allergy / immunology, asthma, complete physica...",A female for a complete physical and follow up...,Followup on Asthma.txt,SUBJECTIVE: This is a 42-year-old white femal...,Allergy / Immunology,"{\n ""History"": ""The patient is a 42-year-old ...",SUBJECTIVE: This is a 42-year-old white femal...,"{\n""indication"": ""Asthma exacerbation, Allergi..."
2,"allergy / immunology, chronic glossitis, xeros...","Chronic glossitis, xerostomia, probable enviro...",Evaluation of Allergies.txt,HISTORY: A 55-year-old female presents self-re...,Allergy / Immunology,"{\n ""History"": ""The patient is a 55-year-old ...",HISTORY: A 55-year-old female presents self-re...,"{\n""indication"": ""Evaluation and treatment of ..."
3,"allergy / immunology, keflex, acute allergic r...","Acute allergic reaction, etiology uncertain, h...",Allergy Evaluation Consult.txt,HISTORY: A 34-year-old male presents today se...,Allergy / Immunology,"{\n ""History"": ""The patient is a 34-year-old ...",HISTORY: A 34-year-old male presents today se...,"{\n""indication"": ""Acute allergic reaction, Ren..."
4,"allergy / immunology, sinusitis, cephalgia, si...",Functional endoscopic sinus surgery with left ...,Ethmoidectomy and Mastoid Antrostomy.txt,PREOPERATIVE DIAGNOSES:1. Chronic sinusitis.2....,Allergy / Immunology,"{\n ""History"": ""The patient is a 50-year-old ...",PREOPERATIVE DIAGNOSES:1. Chronic sinusitis.2....,"{\n""indication"": ""Chronic sinusitis and sinus ..."


Note that we do not merge on `Clinical_Note`: a single entry has a different clinical note in the two files even though the values of its merge-key columns are identical:

In [9]:
# Keep only the rows that carry a clinical note from both files...
both_notes_present = (
    mt_samples_summarization["Clinical_Note_rounding"].notna()
    & mt_samples_summarization["Clinical_Note_clean"].notna()
)
not_nan = mt_samples_summarization[both_notes_present]

# ...and show the ones where the two notes disagree.
notes_differ = not_nan["Clinical_Note_rounding"] != not_nan["Clinical_Note_clean"]
not_nan[notes_differ]

Unnamed: 0,Keywords,Description,Filename,Clinical_Note_rounding,Medical_Specialty,output_gpt-4_rounding,Clinical_Note_clean,output_gpt-4_clean
1083,"pediatrics - neonatal, 1-year-old, naps, mama,...",Health maintenance exam for 1-year-old female.,1-year-old Exam - H&P.txt,CHIEF COMPLAINT: This 1-year-old female presen...,Pediatrics - Neonatal,"{\n ""History"": ""This 1-year-old female patien...",CHIEF COMPLAINT: This 1-year-old female presen...,"{\n""indication"": ""Routine well child care, Acu..."


The difference is only a punctuation mark, so when both notes are present we keep the one from the `output-clean-gpt4.jsonl` file; when only one of the files contains the entry, we keep whichever note exists:

In [10]:
def _is_missing_note(value) -> bool:
    """Return True when `value` is a scalar missing marker (None, NaN, pd.NA, NaT)."""
    # `pd.isna` returns an array for list-like input, so only scalars are tested.
    return pd.api.types.is_scalar(value) and bool(pd.isna(value))


def select_clinical_note(
    row,
    clinical_note_columns: list[str] = [
        "Clinical_Note_rounding",
        "Clinical_Note_clean",
    ],
    priority_clinical_note="Clinical_Note_clean",
):
    """Pick the clinical note to keep for one merged row.

    Args:
        row: Mapping-like row (e.g. a `pd.Series` from `DataFrame.apply(axis=1)`)
            indexable by the names in `clinical_note_columns`.
        clinical_note_columns: Candidate columns holding the clinical note.
            The default list is only read, never mutated.
        priority_clinical_note: Column whose value wins when it is available.

    Returns:
        `row[priority_clinical_note]` when every candidate is present or when the
        priority column itself is present; otherwise the first non-missing candidate.

    Raises:
        ValueError: If every candidate column is missing for this row.
    """
    # Use pandas' missing-value check instead of `isinstance(value, float)`, so that
    # None / pd.NA count as missing and a genuine float value does not.
    available_columns = [
        col for col in clinical_note_columns if not _is_missing_note(row[col])
    ]

    if not available_columns:
        raise ValueError("Both clinical notes are missing")

    if (
        len(available_columns) == len(clinical_note_columns)
        or priority_clinical_note in available_columns
    ):
        return row[priority_clinical_note]

    # The priority note is unavailable: fall back to the first note that exists.
    return row[available_columns[0]]


# Collapse the two per-file note columns into a single `Clinical_Note` column, row by row.
mt_samples_summarization["Clinical_Note"] = mt_samples_summarization.apply(
    select_clinical_note, axis=1
)

# Only a column is added here; the number of rows does not change.
mt_samples_summarization.shape

(1266, 9)

In [11]:
# `Clinical_Note` now carries the selected note, so the per-file note columns can go.
mt_samples_summarization = mt_samples_summarization.drop(
    columns=["Clinical_Note_rounding", "Clinical_Note_clean"]
)

mt_samples_summarization.shape


(1266, 7)

In [12]:
# Rows present in only one file have NaN in the other file's columns after the outer merge.
# Replace them with "" so every cell is a string when `try_json_loads` parses it below;
# an empty string fails to decode and therefore becomes {}.
mt_samples_summarization.fillna("", inplace=True)

In [13]:
# Inspect the merged frame (pandas abbreviates the display to the first and last rows).
mt_samples_summarization

Unnamed: 0,Keywords,Description,Filename,Medical_Specialty,output_gpt-4_rounding,output_gpt-4_clean,Clinical_Note
0,"allergy / immunology, allergic rhinitis, aller...",A 23-year-old white female presents with compl...,Allergic Rhinitis.txt,Allergy / Immunology,"{\n ""History"": ""The patient is a 23-year-old ...","{\n""indication"": ""Allergic rhinitis"",\n""indica...",SUBJECTIVE: This 23-year-old white female pres...
1,"allergy / immunology, asthma, complete physica...",A female for a complete physical and follow up...,Followup on Asthma.txt,Allergy / Immunology,"{\n ""History"": ""The patient is a 42-year-old ...","{\n""indication"": ""Asthma exacerbation, Allergi...",SUBJECTIVE: This is a 42-year-old white femal...
2,"allergy / immunology, chronic glossitis, xeros...","Chronic glossitis, xerostomia, probable enviro...",Evaluation of Allergies.txt,Allergy / Immunology,"{\n ""History"": ""The patient is a 55-year-old ...","{\n""indication"": ""Evaluation and treatment of ...",HISTORY: A 55-year-old female presents self-re...
3,"allergy / immunology, keflex, acute allergic r...","Acute allergic reaction, etiology uncertain, h...",Allergy Evaluation Consult.txt,Allergy / Immunology,"{\n ""History"": ""The patient is a 34-year-old ...","{\n""indication"": ""Acute allergic reaction, Ren...",HISTORY: A 34-year-old male presents today se...
4,"allergy / immunology, sinusitis, cephalgia, si...",Functional endoscopic sinus surgery with left ...,Ethmoidectomy and Mastoid Antrostomy.txt,Allergy / Immunology,"{\n ""History"": ""The patient is a 50-year-old ...","{\n""indication"": ""Chronic sinusitis and sinus ...",PREOPERATIVE DIAGNOSES:1. Chronic sinusitis.2....
...,...,...,...,...,...,...,...
1261,"radiology, silver chloride biopotential electr...",Abnormal electronystagmogram demonstrating pro...,Electronystagmogram.txt,Radiology,"{\n ""History"": ""No relevant medical history, ...","{\n""indication"": ""Vestibular Dysfunction"",\n""i...",PROCEDURE: This tracing was obtained utilizing...
1262,"radiology, supraclavicular, cervical adenopath...",A 68-year-old white male with recently diagnos...,CT Chest - 2.txt,Radiology,"{\n ""History"": ""The patient is a 68-year-old ...","{\n""indication"": ""Adenocarcinoma"",\n""indicatio...",CLINICAL HISTORY: A 68-year-old white male wi...
1263,"radiology, transcatheter infusion of papaverin...",Diagnostic cerebral angiogram and transcathete...,Diagnostic Cerebral Angiogram.txt,Radiology,"{\n ""History"": ""The patient is a 13-year-old ...","{\n""indication"": ""Postoperative check angiogra...",EXAM:1. Diagnostic cerebral angiogram.2. Trans...
1264,"radiology, tumor cells, concomitant chemoradio...",Concomitant chemoradiotherapy for curative int...,Concomitant Chemoradiotherapy.txt,Radiology,,"{\n""indication"": ""Cancer"",\n""indication_icd10""...",CONCOMITANT CHEMORADIOTHERAPY FOR CURATIVE INT...


#### Process gpt-4 outputs

In [14]:
def try_json_loads(value: str):
    """Parse a JSON object from `value`, falling back to an empty dict.

    Args:
        value: JSON text to decode. A value that is already a dict is returned
            unchanged, so applying this function twice to the same column is harmless.

    Returns:
        The decoded dict, or `{}` when `value` is not valid JSON, is not a
        string-like input (e.g. NaN or None), or decodes to something other than
        a JSON object (callers use the result as a dict).
    """
    if isinstance(value, dict):
        return value

    try:
        parsed = json.loads(value)
    except (json.JSONDecodeError, TypeError):
        # JSONDecodeError: malformed or empty text; TypeError: non-string input such as NaN.
        return {}

    return parsed if isinstance(parsed, dict) else {}
    
# Decode the raw JSON text of both GPT-4 output columns into Python objects.
for gpt4_column in ("output_gpt-4_rounding", "output_gpt-4_clean"):
    mt_samples_summarization[gpt4_column] = mt_samples_summarization[gpt4_column].apply(try_json_loads)

In [15]:
# The parsed JSON objects do not all share the same keys, so gather every key seen per source.
rounding_keys = {
    key
    for parsed_output in mt_samples_summarization["output_gpt-4_rounding"]
    for key in parsed_output.keys()
}
clean_keys = {
    key
    for parsed_output in mt_samples_summarization["output_gpt-4_clean"]
    for key in parsed_output.keys()
}

len(rounding_keys), len(clean_keys)

(11, 11)

In [16]:
target_source_output_key_map = {}

rounding_keys_map = {key.lower(): key for key in rounding_keys}
clean_keys_map = {key.lower(): key for key in clean_keys}

for clean_key, key in clean_keys_map.items():
    target_source_output_key_map[clean_key] = {"output_gpt-4_clean": key}

for clean_key, key in rounding_keys_map.items():
    if clean_key not in target_source_output_key_map:
        target_source_output_key_map[clean_key] = {}

    target_source_output_key_map[clean_key]["output_gpt-4_rounding"] = key

target_source_output_key_map

{'findings': {'output_gpt-4_clean': 'findings',
  'output_gpt-4_rounding': 'Findings'},
 'diagnosis_icd10': {'output_gpt-4_clean': 'diagnosis_icd10',
  'output_gpt-4_rounding': 'diagnosis_icd10'},
 'nurse': {'output_gpt-4_clean': 'nurse'},
 'indication_icd10': {'output_gpt-4_clean': 'indication_icd10',
  'output_gpt-4_rounding': 'indication_icd10'},
 'patient': {'output_gpt-4_clean': 'patient'},
 'loinc': {'output_gpt-4_clean': 'LOINC', 'output_gpt-4_rounding': 'LOINC'},
 'indication': {'output_gpt-4_clean': 'indication',
  'output_gpt-4_rounding': 'indication'},
 'pharmacy': {'output_gpt-4_clean': 'pharmacy'},
 'results': {'output_gpt-4_clean': 'results',
  'output_gpt-4_rounding': 'results'},
 'rxnorm': {'output_gpt-4_clean': 'RxNorm', 'output_gpt-4_rounding': 'RxNorm'},
 'summary': {'output_gpt-4_clean': 'summary'},
 'history': {'output_gpt-4_rounding': 'History'},
 'plan': {'output_gpt-4_rounding': 'Plan'},
 'cpt': {'output_gpt-4_rounding': 'CPT'},
 'currentcondition': {'output_gpt

In [17]:
# target key suffix -> {source column name -> key spelling inside that column's dict}
OutputKeyMapping = dict[str, dict[str, str]]


def explode_gpt4_outputs(row, output_key_mapping: OutputKeyMapping):
    """Flatten the per-source GPT-4 output dicts of one row into a single dict.

    Args:
        row: Mapping-like row whose source columns hold dict-like GPT-4 outputs.
        output_key_mapping: For each target key suffix, the source columns to read
            and the key to look up inside each of them.

    Returns:
        A dict keyed by `"{source column}_{target suffix}"` holding every value that
        exists; sources or keys absent from the row are skipped.
    """
    return {
        f"{source_column}_{target_suffix}": row[source_column][source_key]
        for target_suffix, sources in output_key_mapping.items()
        for source_column, source_key in sources.items()
        if source_column in row and source_key in row[source_column]
    }

# Bind the key mapping up front so that `apply` only has to supply each row.
explode_row = partial(explode_gpt4_outputs, output_key_mapping=target_source_output_key_map)

mt_samples_summarization["processed_gpt4_outputs"] = mt_samples_summarization.apply(explode_row, axis=1)
mt_samples_summarization.shape

(1266, 8)

In [18]:
# The flattened dicts can differ from row to row, so gather every key that occurs.
combined_keys = {
    key
    for processed_outputs in mt_samples_summarization["processed_gpt4_outputs"]
    for key in processed_outputs.keys()
}

len(combined_keys)

22

In [19]:
# Turn the per-row dicts into a frame with one column per key. `max_level=0` stops at the
# top level, so nested values (lists / dicts) stay in a single cell instead of being flattened.
gpt4_output_df = pd.json_normalize(mt_samples_summarization["processed_gpt4_outputs"], max_level=0)
gpt4_output_df.shape

(1266, 22)

In [20]:
# Combine the original columns with the exploded GPT-4 output columns.
# `concat(axis=1)` aligns on the index, not on position, so both frames must share it.
assert mt_samples_summarization.index.equals(gpt4_output_df.index), "row indexes differ"

# Drop exploded columns left over from a previous run of this cell first. Otherwise
# re-running it appends duplicate column names, which `to_json(orient="records")` rejects.
mt_samples_summarization = pd.concat(
    [
        mt_samples_summarization.drop(columns=gpt4_output_df.columns, errors="ignore"),
        gpt4_output_df,
    ],
    axis=1,
)
mt_samples_summarization.shape

(1266, 30)

In [21]:
# Preview the combined frame: the existing columns followed by the exploded GPT-4 output columns.
mt_samples_summarization.head()

Unnamed: 0,Keywords,Description,Filename,Medical_Specialty,output_gpt-4_rounding,output_gpt-4_clean,Clinical_Note,processed_gpt4_outputs,output_gpt-4_clean_findings,output_gpt-4_rounding_findings,...,output_gpt-4_clean_pharmacy,output_gpt-4_clean_rxnorm,output_gpt-4_rounding_rxnorm,output_gpt-4_clean_summary,output_gpt-4_rounding_history,output_gpt-4_rounding_plan,output_gpt-4_rounding_cpt,output_gpt-4_rounding_currentcondition,output_gpt-4_clean_results,output_gpt-4_rounding_results
0,"allergy / immunology, allergic rhinitis, aller...",A 23-year-old white female presents with compl...,Allergic Rhinitis.txt,Allergy / Immunology,{'History': 'The patient is a 23-year-old whit...,"{'indication': 'Allergic rhinitis', 'indicatio...",SUBJECTIVE: This 23-year-old white female pres...,{'output_gpt-4_clean_findings': 'Patient prese...,"Patient presents with allergies, worse than wh...",The patient's symptoms and physical examinatio...,...,Dispense Zyrtec and Nasonex as prescribed. Che...,"[20610, 56946, 314076]","[{'code': '20610', 'description': 'Zyrtec'}, {...",23-year-old female with history of allergies a...,The patient is a 23-year-old white female with...,The patient will switch from Allegra to Zyrtec...,[],The patient presents with complaints of worsen...,,
1,"allergy / immunology, asthma, complete physica...",A female for a complete physical and follow up...,Followup on Asthma.txt,Allergy / Immunology,{'History': 'The patient is a 42-year-old whit...,"{'indication': 'Asthma exacerbation, Allergic ...",SUBJECTIVE: This is a 42-year-old white femal...,{'output_gpt-4_clean_findings': 'Patient's ast...,"Patient's asthma has worsened, requiring daily...",The patient's recent lab work and cholesterol ...,...,"Dispense Flovent 44 mcg, Allegra, Flonase, and...","[688242, 57052, 32968, 20610, 860974]","[{'code': '4337', 'description': 'Proventil'},...",42-year-old female with worsening asthma and a...,The patient is a 42-year-old white female with...,The patient's current problems include worseni...,"[{'code': '99214', 'description': 'Office or o...",The patient's asthma has worsened over the las...,,
2,"allergy / immunology, chronic glossitis, xeros...","Chronic glossitis, xerostomia, probable enviro...",Evaluation of Allergies.txt,Allergy / Immunology,{'History': 'The patient is a 55-year-old fema...,{'indication': 'Evaluation and treatment of al...,HISTORY: A 55-year-old female presents self-re...,{'output_gpt-4_clean_findings': 'Patient has a...,"Patient has a history of fibromyalgia, peptic ...",The patient was referred for RAST allergy test...,...,No changes in her medication were prescribed u...,[],"[{'code': 'Not provided', 'description': 'Not ...",55-year-old female presents for evaluation and...,The patient is a 55-year-old female with a his...,The patient will follow up after RAST allergy ...,"[{'code': 'Not provided', 'description': 'Not ...",The patient presents with symptoms of allergie...,,
3,"allergy / immunology, keflex, acute allergic r...","Acute allergic reaction, etiology uncertain, h...",Allergy Evaluation Consult.txt,Allergy / Immunology,{'History': 'The patient is a 34-year-old male...,"{'indication': 'Acute allergic reaction, Renal...",HISTORY: A 34-year-old male presents today se...,{'output_gpt-4_clean_findings': 'Patient had a...,"Patient had an acute allergic reaction, possib...","The patient had an acute allergic reaction, su...",...,Ensure patient is not prescribed cephalosporin...,"[1516, 6809, 104959, 20610]","[{'code': '159', 'description': 'Atenolol'}, {...",34-year-old male with a history of renal failu...,The patient is a 34-year-old male with a histo...,The patient's main problems are acute allergic...,"[{'code': '86003', 'description': 'Allergen sp...","The patient is currently stable, alert, and or...",,
4,"allergy / immunology, sinusitis, cephalgia, si...",Functional endoscopic sinus surgery with left ...,Ethmoidectomy and Mastoid Antrostomy.txt,Allergy / Immunology,{'History': 'The patient is a 50-year-old fema...,{'indication': 'Chronic sinusitis and sinus ce...,PREOPERATIVE DIAGNOSES:1. Chronic sinusitis.2....,{'output_gpt-4_clean_findings': 'Chronic infla...,Chronic inflammation within the left maxillary...,Functional endoscopic sinus surgery with left ...,...,Ensure availability of 1% lidocaine with 1:100...,[3498],[],50-year-old female with a history of chronic l...,The patient is a 50-year-old female with a his...,The patient's main problems are chronic sinusi...,"[{'code': '31254', 'description': 'Nasal/sinus...",The patient was stable during the procedure. S...,,


## Save dataset

In [23]:
# Write the unified frame as JSON Lines into a throw-away folder and register that folder
# as a dataset; the folder is deleted when the `with` block exits.
with tempfile.TemporaryDirectory() as temp_dir_name:
    unified_jsonl_path = os.path.join(temp_dir_name, "unified.jsonl")
    mt_samples_summarization.to_json(unified_jsonl_path, orient="records", lines=True)

    dataset_description = (
        "GPT-4 summaries from mtsamples dataset.\n\n"
        "Dataset created by running GPT-4 on the mtsamples dataset to generate summaries for several specialties.\n\n"
        "This version of the dataset exposes all model outputs directly as columns. It is otherwise identical to the previous version."
    )

    aml.register_folder_as_dataset(
        folder_path=temp_dir_name,
        dataset_name="mtsamples-gpt4-summarization",
        dataset_description=dataset_description,
    )

Subtype value SAS has no mapping, use base class DataReferenceCredentialDto.
[32mUploading tmp17gseds6 (18.27 MBs): 100%|██████████| 18269693/18269693 [00:10<00:00, 1726619.59it/s]
[39m

