In [18]:
import pandas as pd
import random
from utils import *
# Seed Python's global `random` generator so any later use of it is repeatable.
random.seed(0)

# `load_config` is not imported by name; presumably it is provided by
# `from utils import *` — TODO confirm. The returned object's `project_path`
# attribute is read by the data-loading cells below.
config = load_config()

We decided to prompt the model with the few-shot prompt that had already been developed.

In [19]:
# Prepare 1000 examples of MIMIC notes.
# Source: the filtered discharge dataset that was also used in
# mimicv_keyword_to_json.ipynb.


# Project root comes from the loaded config; processed data lives under
# <project_path>/data/processed.
PROJECT_PATH = config.project_path
DATA_PATH = PROJECT_PATH.joinpath("data/processed")

# NOTE(review): read_pickle executes pickle code — only load files produced by this project.
filtered_discharge_dataset = pd.read_pickle(DATA_PATH.joinpath('filtered_discharge_dataset.pkl'))

In [20]:
# Draw a reproducible random sample of 1000 notes (fixed random_state),
# keep only the identifier and text columns, and renumber the rows from 0.
selected_mimic = (
    filtered_discharge_dataset
    .sample(n=1000, random_state=0)
    [['note_id', 'subject_id', 'text']]
    .reset_index(drop=True)
)

In [17]:
# Persist the sampled notes to data/processed/selected_mimic.pkl for later use.
selected_mimic.to_pickle(DATA_PATH.joinpath("selected_mimic.pkl"))

## Examples from ChatGPT

In [21]:

import pandas as pd
import random
from utils import *

# Re-seed Python's global `random` generator for this section of the notebook.
random.seed(0)

# Reload the project configuration; `load_config` presumably comes from
# `from utils import *` — TODO confirm.
config = load_config()

In [22]:
# Processed data lives under <project_path>/data/processed.
PROJECT_PATH = config.project_path
DATA_PATH = PROJECT_PATH.joinpath("data/processed")

# Load the notes together with their generated answers; the preview below shows
# the columns note_id, subject_id, text and gpt_output.
# NOTE(review): presumably this file is selected_mimic plus ChatGPT outputs
# produced outside this notebook — confirm where it is generated.
synthetic_data = pd.read_pickle(DATA_PATH.joinpath('selected_mimic_synthetic_data.pkl'))

In [23]:
# Preview the first rows to check the expected columns are present.
synthetic_data.head()

Unnamed: 0,note_id,subject_id,text,gpt_output
0,10332328-DS-27,10332328,\nName: ___ Unit No: __...,1. bilateral lower extremity edema\n2. chronic...
1,10030852-DS-18,10030852,\nName: ___ Unit No: ___\n ...,1. T1DM (Type 1 Diabetes Mellitus)\n1.1. Insul...
2,10092175-DS-10,10092175,\nName: ___ Unit No: _...,1. dyspnea\n2. orthopnea\n3. PND\n4. weight ga...
3,10215754-DS-14,10215754,\nName: ___ Unit No: ___...,1. contractions\t\n2. back pain\n3. R hydronep...
4,10089894-DS-16,10089894,\nName: ___ Unit No: ___\...,1. trauma\t\n2. periprosthetic femur fracture\...


In [17]:
import json
import re
from utils import format_prompt


def parse_dataset_into_json(row_n, topn):
    """Build one instruction-tuning record from a row of ``synthetic_data``.

    The row's ``gpt_output`` is a numbered list such as
    ``"1. dyspnea\\n1.1. chest x-ray\\n2. orthopnea"``. Every numbered line is
    extracted, lower-cased and stripped, and only lines whose *top-level*
    number is at most N (taken from ``topn``) are kept.

    Parameters
    ----------
    row_n : int
        Positional row index into the module-level ``synthetic_data`` frame.
    topn : str
        Instruction variant of the form ``"top<N>"`` (e.g. ``"top3"``). It
        selects both the rank cut-off and the prompt file
        ``prompts/finetune_instruction_<topn>.txt`` under ``PROJECT_PATH``.

    Returns
    -------
    dict
        ``{"input": <note text>, "output": <filtered list, one item per
        line>, "instruction": <prompt file contents>}``.

    Raises
    ------
    ValueError
        If ``topn`` without the ``"top"`` prefix is not an integer.
    FileNotFoundError
        If the prompt file for ``topn`` does not exist.
    """
    row = synthetic_data.iloc[row_n]
    rank = int(topn.replace("top", ""))

    # Raw string avoids invalid-escape warnings. `\d*` (instead of `\d?`)
    # accepts multi-digit sub-items such as "2.10", which were dropped before.
    item_pattern = re.compile(r'(\d+\.\d*)\.?\s(.+)')

    kept_lines = []
    for number, term in item_pattern.findall(row['gpt_output']):
        # Compare the whole top-level number, not just its first character:
        # "10." is rank 10, not rank 1.
        if int(number.split(".")[0]) <= rank:
            kept_lines.append(number + " " + term.strip().lower() + "\n")
    output_text = "".join(kept_lines)

    with open(PROJECT_PATH.joinpath(f"prompts/finetune_instruction_{topn}.txt")) as f:
        instruction = f.read()

    return {"input": row['text'],
            "output": output_text,
            "instruction": instruction}

# Smoke check: parse the first row using the top-3 instruction variant.
parse_dataset_into_json(0, "top3")

 'output': '1. bilateral lower extremity edema\n2. chronic respiratory failure\n2.1 home o2\n3. restrictive lung disease\n3.1 budesonide\n',
 'instruction': "\nYou are a helpful assistant, an expert in medical domain. \nExtract top 3 main diagnosis/symptoms or conditions mentioned in the medical note. \nFollowing the diagnosis/symptoms or conditions, identify the medical tests related to it.\nIf there isn't any medical tests related to it, just start listing the next important diagnosis/symptoms or conditions.\nIf there are no additional diagnosis/symptoms or conditions that you can identify, just list the existing ones and finalize the output. \nDon't write no symptoms, or any indication that there is no other diagnosis/symptoms or conditions.\nDo not modify or abbreviate what is written in the notes. Just extract them as they are.\nMake sure the highest priority is assigned with a smaller number.\nWe give you an example, do follow as below.\nThe format should be as follows\n\n1. key 

In [42]:
import random
import numpy as np

# Seed both global generators so the instruction variant chosen per note is reproducible.
np.random.seed(0)
random.seed(0)

topns = ["top3", "top5", "top10"]

# One record per synthetic row, in row order; each row gets an instruction
# variant drawn from `topns` with NumPy's global RNG.
parsed_dataset = [
    parse_dataset_into_json(row_idx, np.random.choice(topns))
    for row_idx in range(len(synthetic_data))
]

In [43]:
# Shuffle in place with a dedicated Random(0) instance, so the resulting order
# does not depend on the global `random` state.
# NOTE: re-running this cell without rebuilding `parsed_dataset` shuffles the
# already-shuffled list again and changes the train/test split.

random.Random(0).shuffle(parsed_dataset)
parsed_dataset[0]

{'input': ' \nName:  ___               Unit No:   ___\n \nAdmission Date:  ___              Discharge Date:   ___\n \nDate of Birth:  ___             Sex:   M\n \nService: MEDICINE\n \nAllergies: \nNo Known Allergies / Adverse Drug Reactions\n \nAttending: ___\n \nChief Complaint:\nL-sided flank pain and cough\n \nMajor Surgical or Invasive Procedure:\nNone\n\n \nHistory of Present Illness:\n___ with DM, HTN, TIA, tobacco abuse, obesity, and asthma who \npresented to clinic with cough and atypical chest pain. He had a \nURI three weeks ago and recovered but has residual persistent \ncough x 3 weeks productive of yellow sputum. He also has had \nL-sided chest pain x 1 week, described as dull, throbbing \npersistent pain, not pleuritic and not associated with exertion \nor positional change. It does not radiate and he has had no SOB, \nnausea, lightheadedness. Patient was seen in clinic today with \nEKG which showed new TWI in I, II, aVL, V3, prompting transfer \nto ED. He takes full dos

In [44]:
# The first 20% of the shuffled records form the test set; the rest is training data.
idx = int(0.2 * len(parsed_dataset))
testset = parsed_dataset[:idx]
trainset = parsed_dataset[idx:]

In [45]:
# Sanity check of the split sizes (test, train).
len(testset), len(trainset)

(200, 800)

In [47]:
# Write the full shuffled dataset and its train/test partitions as JSON files
# under DATA_PATH, in the same order as before: full, train, test.
for filename, records in (
    ("discharge_dataset.json", parsed_dataset),
    ("discharge_dataset_train.json", trainset),
    ("discharge_dataset_test.json", testset),
):
    with open(DATA_PATH.joinpath(filename), "w") as f:
        json.dump(records, f)