In [12]:
from utils import *

# Resolve project locations from the shared config object.
# NOTE(review): load_config is presumably provided by the wildcard `from utils import *` — confirm.
config = load_config()
PROJECT_PATH = config.project_path
# Directory that the notebook reads its processed datasets from and writes its outputs to.
DATA_PATH = PROJECT_PATH.joinpath("data/processed")

In [13]:
import pandas as pd
import pickle

# Load the Pittsburgh and MIMIC chat frames from the processed-data directory.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
pitts = pd.read_pickle(DATA_PATH.joinpath("pittsburgh_chat.pkl"))
mimic = pd.read_pickle(DATA_PATH.joinpath("mimic_chat.pkl"))

In [14]:
# Tag every row with the corpus it came from so the two sources stay
# distinguishable once the frames are concatenated.
pitts['data_source'], mimic['data_source'] = "pitts", "mimic"

In [15]:
# Rename reportID to note_id, then keep only the four columns that are also
# selected from the MIMIC frame.
pitts = (
    pitts
    .rename(columns={"reportID": "note_id"})
    .loc[:, ['data_source', 'note_id', 'texts', 'reformatted_output']]
)

In [16]:
# Keep the same four columns as the Pittsburgh frame so the two can be stacked.
mimic = mimic.loc[:, ['data_source', 'note_id', 'texts', 'reformatted_output']]

In [17]:
# Stack both corpora row-wise into a single frame with a fresh 0..n-1 index.
pitts_and_mimic = pd.concat(objs=[pitts, mimic], axis=0, ignore_index=True)

In [18]:
# Preview the first rows of the Pittsburgh frame after the column selection.
pitts.head()

Unnamed: 0,data_source,note_id,texts,reformatted_output
0,pitts,report7660,[ Report de-identified ( Safe-harbor compliant...,"{'messages': [{'role': 'system', 'content': 'Y..."
1,pitts,report4725,[ Report de-identified ( Safe-harbor compliant...,"{'messages': [{'role': 'system', 'content': 'Y..."
2,pitts,report5005,[ Report de-identified ( Safe-harbor compliant...,"{'messages': [{'role': 'system', 'content': 'Y..."
3,pitts,report2854,[ Report de-identified ( Safe-harbor compliant...,"{'messages': [{'role': 'system', 'content': 'Y..."
4,pitts,report6420,[ Report de-identified ( Safe-harbor compliant...,"{'messages': [{'role': 'system', 'content': 'Y..."


In [19]:
# Inspect the conversation stored under index label 1 to see the message structure.
pitts['reformatted_output'][1]

{'messages': [{'role': 'system',
   'content': 'You are a helpful assistant trained for healthcare-related text processing'},
  {'role': 'doctor', 'content': 'Hi, how can I help you today?'},
  {'role': 'patient',
   'content': 'I have some questions about my discharge instructions.'},
  {'role': 'doctor',
   'content': 'Sure, let\'s go through them together. Firstly, you\'ve been diagnosed with osteoarthritis in your right hip and you\'ve had hip replacement surgery on February 3rd. You were in the hospital for a few days and it sounds like your recovery is going well. (Primary Diagnosis: "Osteoarthritis of the right hip." Surgical Procedure: "Right total hip arthroplasty under spinal anesthesia.")'},
  {'role': 'patient',
   'content': "That's right. They did say I was recovering well."},
  {'role': 'doctor',
   'content': 'Exactly. Post-surgery, you\'ll need to continue doing physical and occupational therapy exercises at home to help with your movement and recovery. ("The patient i

In [20]:
def format_input_text_for_training(row) :
    """Flatten one row's chat messages into a single plain-text training string.

    Args:
        row: Mapping-like object with a 'texts' entry (the discharge note) and a
            'reformatted_output' entry holding {'messages': [...]}, where each
            message has 'role' and 'content' keys.

    Returns:
        A string in which every message is rendered as "<role> : <content>\\n".
        Whenever a message has role 'system', a header containing the discharge
        note is emitted immediately before that message's own line.
    """
    pieces = []
    for message in row['reformatted_output']['messages'] :
        role = message['role']
        if role == 'system' :
            # Header block; the spacing (including the trailing blanks) is part
            # of the produced text and is kept exactly.
            pieces.append(
                "You are a helpful assistant in healthcare. Here is the patient's discharge note. \n"
                "### Discharge note :\n"
                f"{row['texts']}\n"
                "\n\n\n"
                "            "
            )
        pieces.append(f"{role} : {message['content']}\n")
    return "".join(pieces)

In [21]:
def format_for_sft(row) :
    """Convert one row into a chat conversation suitable for supervised fine-tuning.

    The leading system message (if present) is replaced by a prompt that embeds
    the discharge note, and speaker roles are renamed: 'doctor' -> 'assistant',
    'patient' -> 'user'. Other roles are left untouched.

    Args:
        row: Object exposing `texts` (the discharge note) and `reformatted_output`
            (a dict with a 'messages' list of {'role', 'content'} dicts) as
            attributes, e.g. a DataFrame row passed by `DataFrame.apply(axis=1)`.

    Returns:
        A new conversation dict. The dict stored on `row` is not modified.
    """
    import copy  # local import so this cell does not depend on another cell's imports

    discharge_note = row.texts
    # Work on a deep copy: the nested message dicts are shared with the source
    # frame, and editing them in place would silently rewrite 'reformatted_output'.
    conversation = copy.deepcopy(row.reformatted_output)
    messages = conversation['messages']

    # Guard against an empty message list before looking at the first entry.
    if messages and messages[0]['role'] == 'system' :
        messages[0]['content'] = f"You are a helpful assistant trained for healthcare. Here is the patient's discharge note. \n\n {discharge_note}"

    role_map = {'doctor' : 'assistant', 'patient' : 'user'}
    for conv in messages :
        conv['role'] = role_map.get(conv['role'], conv['role'])

    return conversation

In [22]:
# Build one chat-formatted conversation per row (row-wise apply).
pitts_and_mimic['conversation'] = pitts_and_mimic.apply(format_for_sft, axis="columns")


In [25]:
import json
import jsonlines

conversation = pitts_and_mimic['conversation'].to_list()

# Persist the whole list as a single JSON array.
# json.dump() streams string chunks to the handle, so it needs a plain text
# file: a jsonlines writer would serialize every chunk as its own JSON string
# line and produce an unreadable file. The ".json" name matches the file that
# the reload cell opens with json.load().
with open(DATA_PATH.joinpath("conversation_dataset.json"), 'w') as f :
    json.dump(conversation, f)

In [64]:
# Sanity check: read the saved dataset back from disk.
with open(DATA_PATH.joinpath("conversation_dataset.json")) as f :
    conversation = json.loads(f.read())

In [65]:
import random
# Seed the global RNG before shuffling so the shuffle order (and therefore the
# train/test split taken from it) is reproducible.
random.seed(42)

# Shuffles the list in place.
random.shuffle(conversation)

In [66]:
# 80/20 split of the shuffled conversations: the first 80% is train, the rest is test.
idx = round(0.8 * len(conversation))
train = conversation[:idx]
test = conversation[idx:]

In [67]:
import jsonlines

In [68]:
# Write one conversation per line. Writer.write() serializes its argument as a
# single line, so passing the whole list would put the entire split on one
# line; write_all() emits each item as its own record.
with jsonlines.open(DATA_PATH.joinpath("train_conversation.jsonl"), 'w') as f :
    f.write_all(train)
with jsonlines.open(DATA_PATH.joinpath("test_conversation.jsonl"), 'w') as f :
    f.write_all(test)

In [84]:
# Spot-check one conversation from the test split.
test[2000]

{'messages': [{'role': 'system',
   'content': "You are a helpful assistant trained for healthcare. Here is the patient's discharge note. \n\n  \nName:  ___             Unit No:   ___\n \nAdmission Date:  ___              Discharge Date:   ___\n \nDate of Birth:  ___             Sex:   F\n \nService: NEUROLOGY\n \nAllergies: \nNo Known Allergies / Adverse Drug Reactions\n \nAttending: ___.\n \nChief Complaint:\nmutism, unresponsivess; called as CODE STROKE. \n\n \nMajor Surgical or Invasive Procedure:\nlumbar puncture\n \nHistory of Present Illness:\nMs. ___ is currently mute and unable to provide\nhistory; following history obtained from EMS report and medical\nrecords. \n\n  Ms. ___ is a ___ year-old woman with PMH significant for\na. fib (on aspirin) and schizophrenia (on seroquel and haldol)\nwith recent discharge for catatoinc state due to\nschizophrenia(mute, unresponsive to commands; at time of\ndischarge she was able to hold conversation), who presents today\nafter witnessed fa

In [46]:
# Persist the combined frame in the processed-data directory.
pitts_and_mimic.to_pickle(DATA_PATH.joinpath("pitts_and_mimic.pkl"))

In [47]:
import pandas as pd

# Reload the combined frame through DATA_PATH, the same location it is written
# to, instead of a hard-coded relative path that depends on the working directory.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle(DATA_PATH.joinpath("pitts_and_mimic.pkl"))

In [95]:
from datasets import Dataset, load_dataset

# Load only the test file here. With a single path and no split mapping, the
# progress output below shows the records being generated as the "train" split.
data = load_dataset("json", data_files=DATA_PATH.joinpath("test_conversation.jsonl").as_posix())

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 3558 examples [00:00, 3668.93 examples/s]


In [27]:
# Resolve the model location through the project config and load it.
# NOTE(review): config.model_path / config.load_model are project helpers not
# shown here; load_model presumably returns a (model, tokenizer) pair — confirm.
model_path = config.model_path('llama3.2-1B')
model, tokenizer = config.load_model(model_path)


In [None]:
# Render each example's messages through the tokenizer's chat template and
# store the resulting text (not token ids) under "formatted_chat".
df = data.map(
    lambda example : {
        "formatted_chat" : tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
            add_generation_prompt=False,
        )
    }
)

Map: 100%|██████████| 14231/14231 [00:04<00:00, 3091.45 examples/s]
Map: 100%|██████████| 3558/3558 [00:01<00:00, 2648.98 examples/s]


DatasetDict({
    train: Dataset({
        features: ['messages', 'formatted_chat'],
        num_rows: 14231
    })
    test: Dataset({
        features: ['messages', 'formatted_chat'],
        num_rows: 3558
    })
})

In [59]:
# print() is used so the newlines in the text are rendered.
# NOTE(review): no visible cell creates the 'conversation_text' column; it was
# presumably built with format_input_text_for_training — confirm.
print(df['conversation_text'][0])

You are a helpful assistant in healthcare. Here is the patient's discharge note. 
### Discharge note :
[ Report de-identified ( Safe-harbor compliant ) by De-ID v.6.22.07.0]



**INSTITUTION GENERAL MEDICINE DISCHARGE SUMMARY PATIENT NAME : **NAME[AAA , BBB M] ACCOUNT # : **ID-NUM **ROOM ATTENDING PHYSICIAN : **NAME[YYY M ZZZ] ADMISSION DATE : **DATE[ Jul 06 2007] DISCHARGE DATE : **DATE[ Jul 11 2007] CHIEF COMPLAINT : Report of confusion .
HISTORY OF PRESENT ILLNESS : The patient is a **AGE[in 50s]-year-old gentleman with a past medical history significant for hepatic encephalopathy who is not very compliant with his lactulose because it gives him diarrhea .
He reported to the EDS after complaining of subjective feelings of confusion .
The patient does have a history of being admitted in the recent past , last time on **DATE[ Jun 14 2007] , and the time prior **DATE[ May 26 2007] .
In the ED , he was found to be relatively near baseline .
He denied any history of fall or trauma , feve

In [44]:
# Spot-check the first conversation of the train split.
train[0]

{'messages': [{'role': 'system',
  {'role': 'doctor', 'content': 'Hi, how can I help you today?'},
  {'role': 'patient',
   'content': "I have some questions about my discharge instructions. I'm trying to understand what I need to focus on after leaving the hospital."},
  {'role': 'doctor',
   'content': 'Sure! Let\'s go through the main points. You were admitted because of muscle aches and a fever due to a weak immune system from your treatments. We gave you antibiotics as a precaution, but no infection was found, and you didn\'t have more fevers while you were here. (Evidence: "You were admitted to ___ because you were having muscle aches and developed a fever to 100.4 degrees... no source of infection was found and you did not have any more fevers while you were here.")'},
  {'role': 'patient',
   'content': "Okay, that's good to know. What should I be doing at home?"},
  {'role': 'doctor',
   'content': 'Your immune system is still weak, so it\'s important to follow a strict neutro

In [None]:
from datasets import load_dataset

# Load a public chat-style dataset from the Hub.
# NOTE(review): this rebinds `data`, the same name other cells use for the
# local conversation files.
data = load_dataset("philschmid/dolly-15k-oai-style",split="train")

Generating train split: 100%|██████████| 15011/15011 [00:00<00:00, 116476.33 examples/s]


In [3]:
from datasets import load_dataset
from utils import * 
# Resolve the processed-data directory from the project config.
config = load_config()
PROJECT_PATH = config.project_path
DATA_PATH = PROJECT_PATH.joinpath("data/processed")

# Load both splits from their JSON-lines files into one dataset object,
# keyed by split name.
data = load_dataset(
    "json",
    data_files={
        split : DATA_PATH.joinpath(f"{split}_conversation.jsonl").as_posix()
        for split in ("train", "test")
    },
)